From 1df694b81fc38dff39960ee969e47c07ec6acccf Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 4 Nov 2025 14:48:06 -0800 Subject: [PATCH 1/3] AMDGPU: Stop implementing shouldCoalesce Use the default, which freely coalesces anything it can. This mostly shows improvements, with a handful of regressions. The main concern would be if introducing wider registers is more likely to push the register usage up to the next occupancy tier. --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 14 - llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 8 - llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 681 +- .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll | 232 +- .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 424 +- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 808 +- .../AMDGPU/fix-sgpr-copies-nondeterminism.ll | 20 +- llvm/test/CodeGen/AMDGPU/freeze.ll | 144 +- .../AMDGPU/gfx-callable-return-types.ll | 224 +- .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 10 +- .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 56 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 676 +- llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 557 +- llvm/test/CodeGen/AMDGPU/merge-stores.ll | 38 +- .../AMDGPU/mfma-no-register-aliasing.ll | 5 +- .../AMDGPU/schedule-amdgpu-trackers.ll | 2 +- llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 4923 ++--- .../AMDGPU/shufflevector.v2f32.v4f32.ll | 128 +- .../AMDGPU/shufflevector.v2f32.v8f32.ll | 228 +- .../AMDGPU/shufflevector.v2i32.v4i32.ll | 128 +- .../AMDGPU/shufflevector.v2i32.v8i32.ll | 228 +- .../AMDGPU/shufflevector.v2i64.v2i64.ll | 280 +- .../AMDGPU/shufflevector.v2i64.v3i64.ll | 710 +- .../AMDGPU/shufflevector.v2i64.v4i64.ll | 502 +- .../AMDGPU/shufflevector.v2i64.v8i64.ll | 958 +- .../CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll | 280 +- .../CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll | 710 +- .../CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll | 502 +- .../CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll | 128 +- .../CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll | 228 +- .../AMDGPU/shufflevector.v3f32.v2f32.ll | 7 +- .../AMDGPU/shufflevector.v3f32.v3f32.ll | 1652 +- .../AMDGPU/shufflevector.v3f32.v4f32.ll | 2139 +- .../AMDGPU/shufflevector.v3i32.v2i32.ll | 7 +- .../AMDGPU/shufflevector.v3i32.v3i32.ll | 1652 +- .../AMDGPU/shufflevector.v3i32.v4i32.ll | 2139 +- .../AMDGPU/shufflevector.v3i64.v2i64.ll | 1616 +- .../AMDGPU/shufflevector.v3i64.v3i64.ll | 4231 ++-- .../AMDGPU/shufflevector.v3i64.v4i64.ll | 7707 +++---- .../CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll | 1616 +- .../CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll | 4231 ++-- .../CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll | 7707 +++---- .../CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll | 7 +- .../CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll | 1652 +- .../CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll | 2139 +- .../AMDGPU/shufflevector.v4f32.v2f32.ll | 476 +- .../AMDGPU/shufflevector.v4f32.v3f32.ll | 3451 ++- .../AMDGPU/shufflevector.v4f32.v4f32.ll | 5003 ++--- .../AMDGPU/shufflevector.v4i32.v2i32.ll | 476 +- .../AMDGPU/shufflevector.v4i32.v3i32.ll | 3451 ++- .../AMDGPU/shufflevector.v4i32.v4i32.ll | 5003 ++--- .../AMDGPU/shufflevector.v4i64.v2i64.ll | 2414 +- .../AMDGPU/shufflevector.v4i64.v3i64.ll | 12328 ++++------- .../AMDGPU/shufflevector.v4i64.v4i64.ll | 18146 +++++++--------- .../CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll | 2414 +- .../CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll | 12328 ++++------- .../CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll | 18146 +++++++--------- .../CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll | 476 +- .../CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll | 3451 ++- .../CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll | 5003 ++--- .../AMDGPU/vector-legalizer-divergence.ll | 15 +- .../CodeGen/AMDGPU/widen-vselect-and-mask.ll | 23 +- 62 files changed, 61351 insertions(+), 83587 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 5484fab3efdcc..ad3828fba2187 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3755,20 +3755,6 @@ bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, return RC && isAGPRClass(RC); } -bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, - const TargetRegisterClass *SrcRC, - unsigned SubReg, - const TargetRegisterClass *DstRC, - unsigned DstSubReg, - const TargetRegisterClass *NewRC, - LiveIntervals &LIS) const { - // TODO: This should be more aggressive, but be more cautious with very wide - // tuples. - unsigned NewSize = getRegSizeInBits(*NewRC); - return NewSize <= 128 || NewSize <= getRegSizeInBits(*SrcRC) || - NewSize <= getRegSizeInBits(*DstRC); -} - unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index bb8a80f811d4c..2e2916f68f584 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -346,14 +346,6 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { ArrayRef getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const; - bool shouldCoalesce(MachineInstr *MI, - const TargetRegisterClass *SrcRC, - unsigned SubReg, - const TargetRegisterClass *DstRC, - unsigned DstSubReg, - const TargetRegisterClass *NewRC, - LiveIntervals &LIS) const override; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 3eecaccf0308f..5347110468d9c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2404,63 +2404,62 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 +; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11 ; GFX7-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX7-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] -; GFX7-NEXT: v_addc_u32_e32 v28, vcc, 0, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] -; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] -; GFX7-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] -; GFX7-NEXT: v_mov_b32_e32 v22, v26 -; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc -; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] -; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] -; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] -; GFX7-NEXT: v_mul_lo_u32 v24, v6, v9 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] -; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] -; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] -; GFX7-NEXT: v_mul_lo_u32 v10, v1, v14 -; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v3, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v10, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v1, v9, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v8, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v1, v11, v[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v10, v[20:21] +; GFX7-NEXT: v_addc_u32_e64 v20, s[4:5], 0, v24, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v3, v9, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v20, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v8, v[24:25] +; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v16, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v13, v[21:22] +; GFX7-NEXT: v_mul_lo_u32 v25, v6, v9 +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v1, v12, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v2, v11, v[21:22] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[14:15], v0, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[21:22] +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v26, vcc +; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v1, v10, v[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX7-NEXT: v_mad_u64_u32 v[10:11], vcc, v2, v9, v[19:20] ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 -; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] -; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] -; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX7-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc +; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v8, v[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[14:15], v5, v8, v[12:13] +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[16:17], v0, v9, v[17:18] +; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17] ; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] -; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[11:12], vcc, v1, v8, v[2:3] +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v6, v19, vcc +; GFX7-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, v4, v20, vcc +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, v5, v21, vcc +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v24, v22, vcc +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v23, v0, vcc +; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v10, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v30, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v29, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v28, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v27, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v25, s[4:5] ; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, v16 ; GFX7-NEXT: v_mov_b32_e32 v1, v11 @@ -2472,63 +2471,62 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 +; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11 ; GFX8-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX8-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] -; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] -; GFX8-NEXT: v_mov_b32_e32 v22, v26 -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc -; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] -; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] -; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] -; GFX8-NEXT: v_mul_lo_u32 v24, v6, v9 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] -; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] -; GFX8-NEXT: v_mul_lo_u32 v10, v1, v14 -; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v3, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v10, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v1, v9, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v8, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v1, v11, v[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v10, v[20:21] +; GFX8-NEXT: v_addc_u32_e64 v20, s[4:5], 0, v24, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v3, v9, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v20, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v8, v[24:25] +; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v16, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v13, v[21:22] +; GFX8-NEXT: v_mul_lo_u32 v25, v6, v9 +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v1, v12, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v2, v11, v[21:22] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[14:15], v0, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[21:22] +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v26, vcc +; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v1, v10, v[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[10:11], vcc, v2, v9, v[19:20] ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 -; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] -; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v8, v[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[14:15], v5, v8, v[12:13] +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[16:17], v0, v9, v[17:18] +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17] ; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] -; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[11:12], vcc, v1, v8, v[2:3] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v6, v19, vcc +; GFX8-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v20, vcc +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v21, vcc +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v24, v22, vcc +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v23, v0, vcc +; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v10, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v30, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v29, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v28, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v27, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v25, s[4:5] ; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, v16 ; GFX8-NEXT: v_mov_b32_e32 v1, v11 @@ -2540,63 +2538,62 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 +; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11 ; GFX9-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX9-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] -; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] -; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] -; GFX9-NEXT: v_mov_b32_e32 v22, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v16, vcc -; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] -; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] -; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] -; GFX9-NEXT: v_mul_lo_u32 v24, v6, v9 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] -; GFX9-NEXT: v_mul_lo_u32 v10, v1, v14 -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v3, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v10, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v1, v9, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v8, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v1, v11, v[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v10, v[20:21] +; GFX9-NEXT: v_addc_co_u32_e64 v20, s[4:5], 0, v24, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v3, v9, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v16, s[4:5], 0, v20, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v8, v[24:25] +; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v16, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v13, v[21:22] +; GFX9-NEXT: v_mul_lo_u32 v25, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v1, v12, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v2, v11, v[21:22] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[14:15], v0, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v26, vcc +; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v1, v10, v[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[10:11], vcc, v2, v9, v[19:20] ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], 0, v2, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v8, v[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[14:15], v5, v8, v[12:13] +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[16:17], v0, v9, v[17:18] +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17] ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v4, v21, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v28, v22, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v13, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v23, v14, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v27, v0, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v10, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v30, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v29, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[11:12], vcc, v1, v8, v[2:3] +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v19, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v20, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v21, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v24, v22, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v23, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v10, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v30, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v29, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v28, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v27, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v25, s[4:5] ; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v16 ; GFX9-NEXT: v_mov_b32_e32 v1, v11 @@ -2611,66 +2608,68 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: v_mov_b32_e32 v17, v1 ; GFX10-NEXT: v_mov_b32_e32 v18, v2 ; GFX10-NEXT: v_mov_b32_e32 v19, v3 -; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mul_lo_u32 v30, v4, v11 -; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v18, v12, v[2:3] -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v16, v12, 0 -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v19, v11, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v11, v[2:3] -; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, v4, v10, v[20:21] -; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v18, v10, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[2:3] -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v16, v10, 0 -; GFX10-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v19, v9, v[20:21] -; GFX10-NEXT: v_mad_u64_u32 v[25:26], s4, v6, v8, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[2:3] -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v4, v8, v[23:24] -; GFX10-NEXT: v_mov_b32_e32 v23, v25 -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 +; GFX10-NEXT: v_mov_b32_e32 v20, v4 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v16, v14, 0 +; GFX10-NEXT: v_mov_b32_e32 v21, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v22, v7 +; GFX10-NEXT: v_mul_lo_u32 v31, v17, v14 +; GFX10-NEXT: v_mul_lo_u32 v29, v20, v11 +; GFX10-NEXT: v_mul_lo_u32 v30, v16, v15 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v17, v13, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v16, v12, 0 +; GFX10-NEXT: v_mul_lo_u32 v27, v0, v9 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v18, v12, v[3:4] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v17, v11, v[1:2] +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[25:26], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, v19, v11, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[5:6], vcc_lo, v18, v10, v[3:4] +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v20, v10, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v19, v9, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[23:24], s4, v21, v9, v[3:4] +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[4:5], vcc_lo, v20, v8, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[6:7], s4, v0, v8, v[23:24] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[25:26] +; GFX10-NEXT: v_add_co_ci_u32_e32 v25, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4 +; GFX10-NEXT: v_mul_lo_u32 v26, v21, v10 +; GFX10-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v16, v13, v[5:6] ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[22:23] -; GFX10-NEXT: v_add_co_ci_u32_e64 v29, s4, 0, v20, s4 -; GFX10-NEXT: v_mov_b32_e32 v20, v3 -; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[24:25], s6, v16, v11, v[20:21] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s5, v18, v11, v[22:23] -; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX10-NEXT: v_mul_lo_u32 v23, v17, v14 -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v17, v10, v[24:25] -; GFX10-NEXT: v_mul_lo_u32 v24, v19, v12 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s6, 0, v3, s6 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s7, v19, v10, v[20:21] -; GFX10-NEXT: v_mul_lo_u32 v25, v18, v13 -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s6, v18, v9, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v3, s6 -; GFX10-NEXT: v_mad_u64_u32 v[13:14], s6, v4, v9, v[11:12] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v16, v9, v[1:2] -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[9:10], s8, v19, v8, v[20:21] -; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s8, 0, v15, s8 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s8, v5, v8, v[13:14] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[3:4] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v16, v9, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v10, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v15, v11, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v6, v12, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v26, v22, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v23, s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v30, s5 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v28, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v27, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, v16, v8, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v28, s4, 0, v28, s4 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s5, v17, v12, v[23:24] +; GFX10-NEXT: v_mad_u64_u32 v[23:24], s6, v16, v11, v[3:4] +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v18, v11, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s6, v17, v10, v[23:24] +; GFX10-NEXT: v_mul_lo_u32 v23, v19, v12 +; GFX10-NEXT: v_mul_lo_u32 v24, v18, v13 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], s7, v19, v10, v[3:4] +; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v14, s6 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s6, v18, v9, v[5:6] +; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s6, 0, v10, s6 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s6, v20, v9, v[11:12] +; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[1:2] +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v19, v8, v[3:4] +; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s8, 0, v14, s8 +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v21, v8, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v12, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v28, v13, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v16, v14, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v25, v15, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s9, v7, v30, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s8, v7, v31, s8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s6, v7, v24, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s6, v7, v23, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v29, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v26, s5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v27, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v22, v8, v[7:8] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i256: @@ -2678,67 +2677,67 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 ; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v8 :: v_dual_mov_b32 v21, v7 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 -; GFX11-NEXT: v_mul_lo_u32 v31, v17, v14 -; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v16, v14, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: v_mov_b32_e32 v22, v8 +; GFX11-NEXT: v_mad_u64_u32 v[26:27], null, v16, v10, 0 +; GFX11-NEXT: v_mul_lo_u32 v28, v0, v9 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v17, v13, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v16, v12, 0 +; GFX11-NEXT: v_mul_lo_u32 v30, v20, v11 ; GFX11-NEXT: v_mul_lo_u32 v15, v16, v15 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v18, v12, v[2:3] -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v16, v12, 0 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v11, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v11, v[2:3] -; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v4, v10, v[7:8] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v18, v10, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[2:3] -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v16, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[25:26], vcc_lo, v19, v9, v[7:8] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v6, v20, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[2:3] -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v4, v20, v[25:26] -; GFX11-NEXT: v_mov_b32_e32 v25, v7 -; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], s0, v18, v20, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v6, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[24:25] -; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v22, s0 -; GFX11-NEXT: v_mov_b32_e32 v22, v3 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s0, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[24:25], s2, v16, v11, v[22:23] -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v20, 0 -; GFX11-NEXT: v_mad_u64_u32 v[22:23], s1, v18, v11, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[24:25] +; GFX11-NEXT: v_mul_lo_u32 v14, v17, v14 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v18, v12, v[3:4] +; GFX11-NEXT: v_mad_u64_u32 v[3:4], s0, v17, v11, v[1:2] +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v19, v11, v[5:6] +; GFX11-NEXT: v_mad_u64_u32 v[5:6], vcc_lo, v18, v10, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v20, v10, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v19, v9, v[5:6] +; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v21, v9, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v7, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[4:5], vcc_lo, v20, v22, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v22, v[24:25] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[26:27] +; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX11-NEXT: v_mul_lo_u32 v27, v21, v10 +; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v16, v13, v[5:6] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], s0, v18, v22, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v8, s0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v22, 0 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], s1, v17, v12, v[24:25] +; GFX11-NEXT: v_mad_u64_u32 v[24:25], s2, v16, v11, v[3:4] +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], s0, v18, v11, v[5:6] +; GFX11-NEXT: v_mad_u64_u32 v[5:6], s2, v17, v10, v[24:25] ; GFX11-NEXT: v_mul_lo_u32 v24, v19, v12 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s2 -; GFX11-NEXT: v_mad_u64_u32 v[11:12], s3, v19, v10, v[22:23] -; GFX11-NEXT: v_mul_lo_u32 v22, v18, v13 -; GFX11-NEXT: v_mad_u64_u32 v[13:14], s2, v18, v9, v[6:7] -; GFX11-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v3, s2 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[11:12] -; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v16, v9, v[1:2] +; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2 +; GFX11-NEXT: v_mul_lo_u32 v25, v18, v13 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s3, v19, v10, v[3:4] +; GFX11-NEXT: v_mad_u64_u32 v[3:4], s2, v18, v9, v[5:6] +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v8, s2 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], s2, v20, v9, v[11:12] +; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[1:2] ; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[9:10], s4, v19, v20, v[13:14] -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v18, s4 -; GFX11-NEXT: v_mad_u64_u32 v[11:12], s4, v5, v20, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v20, v[3:4] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v9, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v10, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v13, v11, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v12, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v8, v15, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v22, s2 +; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v19, v22, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v13, s4 +; GFX11-NEXT: v_mad_u64_u32 v[12:13], s4, v21, v22, v[5:6] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v22, v[10:11] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v8, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v9, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v18, v12, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v13, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v15, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v14, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v25, s2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, s3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, s0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v27, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v21, v20, v[9:10] +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v27, s1 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v28, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v23, v22, v[9:10] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i256: @@ -2750,99 +2749,99 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 ; GFX12-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 -; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mul_lo_u32 v30, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v18, v12, v[2:3] -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v16, v12, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v19, v11, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v11, v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v21, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v16, v14, 0 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 +; GFX12-NEXT: v_mov_b32_e32 v22, v7 +; GFX12-NEXT: v_mad_co_u64_u32 v[25:26], null, v16, v10, 0 +; GFX12-NEXT: v_mul_lo_u32 v31, v17, v14 +; GFX12-NEXT: v_mul_lo_u32 v27, v0, v9 +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v17, v13, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v16, v12, 0 +; GFX12-NEXT: v_mul_lo_u32 v29, v20, v11 +; GFX12-NEXT: v_mul_lo_u32 v30, v16, v15 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v18, v12, v[3:4] +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s0, v17, v11, v[1:2] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v10, v[20:21] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v18, v10, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v19, v11, v[5:6] +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], vcc_lo, v18, v10, v[3:4] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[2:3] -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v16, v10, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], vcc_lo, v19, v9, v[20:21] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[25:26], null, v6, v8, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[2:3] +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v20, v10, v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v19, v9, v[5:6] +; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], null, v21, v9, v[3:4] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v22, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v4, v8, v[23:24] -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v7, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[4:5], vcc_lo, v20, v8, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v0, v8, v[23:24] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[25:26] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], s0, v18, v8, v[0:1] -; GFX12-NEXT: v_mov_b32_e32 v23, v25 +; GFX12-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v3, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v20, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v20, v3 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v16, v13, v[22:23] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], s2, v16, v11, v[20:21] +; GFX12-NEXT: v_cndmask_b32_e64 v28, 0, 1, s0 +; GFX12-NEXT: v_mul_lo_u32 v26, v21, v10 +; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], vcc_lo, v16, v13, v[5:6] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], s0, v18, v8, v[0:1] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], s0, v17, v12, v[0:1] +; GFX12-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v28, s0 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], s1, v18, v11, v[22:23] -; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX12-NEXT: v_mul_lo_u32 v23, v17, v14 -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v10, v[24:25] -; GFX12-NEXT: v_mul_lo_u32 v24, v19, v12 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s1, v17, v12, v[23:24] +; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], s2, v16, v11, v[3:4] +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_cndmask_b32_e64 v14, 0, 1, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s0, v18, v11, v[5:6] +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s2, v17, v10, v[23:24] +; GFX12-NEXT: v_mul_lo_u32 v23, v19, v12 +; GFX12-NEXT: v_mul_lo_u32 v24, v18, v13 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s3, v19, v10, v[3:4] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s3, v19, v10, v[20:21] -; GFX12-NEXT: v_mul_lo_u32 v25, v18, v13 -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], s2, v18, v9, v[14:15] +; GFX12-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v14, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s2, v18, v9, v[5:6] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v10, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], s2, v4, v9, v[11:12] -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v16, v9, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s2, v20, v9, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[1:2] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], s4, v19, v8, v[20:21] +; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v19, v8, v[3:4] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s4, v5, v8, v[13:14] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[3:4] +; GFX12-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v14, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v21, v8, v[5:6] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v9, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v12, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v10, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v28, v13, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v15, v11, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v16, v14, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v6, v12, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v25, v15, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v26, v22, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s4 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s4 +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s3 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v23, s3 +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v26, s1 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v27, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] +; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v22, v8, v[7:8] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i256: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 231460f584a2e..a498525c92360 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -2853,52 +2853,50 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 -; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_lshr_b64 v[0:1], v[11:12], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_lshr_b64 v[3:4], v[5:6], 16 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -7396,52 +7394,50 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 -; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_lshr_b64 v[0:1], v[11:12], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_lshr_b64 v[3:4], v[5:6], 16 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -11589,52 +11585,50 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 -; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_lshr_b64 v[0:1], v[11:12], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_lshr_b64 v[3:4], v[5:6], 16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -15361,52 +15355,50 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 -; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_lshr_b64 v[0:1], v[11:12], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_lshr_b64 v[3:4], v[5:6], 16 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index 155ec568a65d3..b846e0ee0a12f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -4052,92 +4052,90 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 -; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28 -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 -; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 -; SI-NEXT: v_mov_b32_e32 v7, v32 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_lshr_b64 v[0:1], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v30 +; SI-NEXT: v_lshr_b64 v[2:3], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_lshr_b64 v[3:4], v[17:18], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; SI-NEXT: v_lshr_b64 v[4:5], v[15:16], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_lshr_b64 v[5:6], v[13:14], 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v26 +; SI-NEXT: v_lshr_b64 v[6:7], v[11:12], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_lshr_b64 v[7:8], v[9:10], 16 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -11209,92 +11207,90 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 -; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28 -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 -; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 -; SI-NEXT: v_mov_b32_e32 v7, v32 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_lshr_b64 v[0:1], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v30 +; SI-NEXT: v_lshr_b64 v[2:3], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_lshr_b64 v[3:4], v[17:18], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; SI-NEXT: v_lshr_b64 v[4:5], v[15:16], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_lshr_b64 v[5:6], v[13:14], 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v26 +; SI-NEXT: v_lshr_b64 v[6:7], v[11:12], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_lshr_b64 v[7:8], v[9:10], 16 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -17934,92 +17930,90 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 -; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28 -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 -; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 -; SI-NEXT: v_mov_b32_e32 v7, v32 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_lshr_b64 v[0:1], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v30 +; SI-NEXT: v_lshr_b64 v[2:3], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_lshr_b64 v[3:4], v[17:18], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; SI-NEXT: v_lshr_b64 v[4:5], v[15:16], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_lshr_b64 v[5:6], v[13:14], 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v26 +; SI-NEXT: v_lshr_b64 v[6:7], v[11:12], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_lshr_b64 v[7:8], v[9:10], 16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -24107,92 +24101,90 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 -; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28 -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 -; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 -; SI-NEXT: v_mov_b32_e32 v7, v32 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_lshr_b64 v[0:1], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v30 +; SI-NEXT: v_lshr_b64 v[2:3], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_lshr_b64 v[3:4], v[17:18], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; SI-NEXT: v_lshr_b64 v[4:5], v[15:16], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_lshr_b64 v[5:6], v[13:14], 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v26 +; SI-NEXT: v_lshr_b64 v[6:7], v[11:12], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_lshr_b64 v[7:8], v[9:10], 16 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index a7f89579b5ce0..9c05297f7bcae 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -6594,176 +6594,174 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 -; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 -; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 -; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 -; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 -; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 -; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 -; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 -; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 -; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v15, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v17, v15 -; SI-NEXT: v_mov_b32_e32 v15, v39 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59 +; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; SI-NEXT: v_lshr_b64 v[6:7], v[35:36], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v57 +; SI-NEXT: v_lshr_b64 v[7:8], v[33:34], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v56 +; SI-NEXT: v_lshr_b64 v[8:9], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v47 +; SI-NEXT: v_lshr_b64 v[9:10], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v46 +; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v45 +; SI-NEXT: v_lshr_b64 v[11:12], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v44 +; SI-NEXT: v_lshr_b64 v[12:13], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v20, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -21622,176 +21620,174 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 -; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 -; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 -; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 -; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 -; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 -; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 -; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 -; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 -; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v15, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v17, v15 -; SI-NEXT: v_mov_b32_e32 v15, v39 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59 +; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; SI-NEXT: v_lshr_b64 v[6:7], v[35:36], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v57 +; SI-NEXT: v_lshr_b64 v[7:8], v[33:34], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v56 +; SI-NEXT: v_lshr_b64 v[8:9], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v47 +; SI-NEXT: v_lshr_b64 v[9:10], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v46 +; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v45 +; SI-NEXT: v_lshr_b64 v[11:12], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v44 +; SI-NEXT: v_lshr_b64 v[12:13], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v20, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -36198,176 +36194,174 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 -; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 -; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 -; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 -; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 -; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 -; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 -; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 -; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 -; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v15, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v17, v15 -; SI-NEXT: v_mov_b32_e32 v15, v39 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59 +; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; SI-NEXT: v_lshr_b64 v[6:7], v[35:36], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v57 +; SI-NEXT: v_lshr_b64 v[7:8], v[33:34], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v56 +; SI-NEXT: v_lshr_b64 v[8:9], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v47 +; SI-NEXT: v_lshr_b64 v[9:10], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v46 +; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v45 +; SI-NEXT: v_lshr_b64 v[11:12], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v44 +; SI-NEXT: v_lshr_b64 v[12:13], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v20, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -49794,176 +49788,174 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 -; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 -; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 -; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 -; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 -; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 -; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 -; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 -; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 -; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v15, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v17, v15 -; SI-NEXT: v_mov_b32_e32 v15, v39 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59 +; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; SI-NEXT: v_lshr_b64 v[6:7], v[35:36], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v57 +; SI-NEXT: v_lshr_b64 v[7:8], v[33:34], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v56 +; SI-NEXT: v_lshr_b64 v[8:9], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v47 +; SI-NEXT: v_lshr_b64 v[9:10], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v46 +; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v45 +; SI-NEXT: v_lshr_b64 v[11:12], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v44 +; SI-NEXT: v_lshr_b64 v[12:13], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v20, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll index 1e469b1951009..4196a9056a521 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll @@ -8,25 +8,21 @@ define amdgpu_gs void @f(i32 inreg %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %bb3 -; CHECK-NEXT: v_mov_b32_e32 v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, v0 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_mov_b32_e32 v5, 1 +; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 1 ; CHECK-NEXT: .LBB0_3: ; %bb4 -; CHECK-NEXT: v_mov_b32_e32 v6, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: s_mov_b32 s2, s0 ; CHECK-NEXT: s_mov_b32 s3, s0 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-NEXT: v_mov_b32_e32 v7, v6 -; CHECK-NEXT: v_mov_b32_e32 v8, v6 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mov_b32_e32 v3, v6 -; CHECK-NEXT: v_mov_b32_e32 v4, v6 -; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: buffer_store_b128 v[5:8], v6, s[0:3], 0 idxen -; CHECK-NEXT: buffer_store_b128 v[1:4], v6, s[0:3], 0 idxen +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_mov_b32_e32 v5, v3 +; CHECK-NEXT: buffer_store_b128 v[2:5], v3, s[0:3], 0 idxen +; CHECK-NEXT: v_mov_b32_e32 v2, v3 +; CHECK-NEXT: buffer_store_b128 v[1:4], v3, s[0:3], 0 idxen ; CHECK-NEXT: s_endpgm bb: %i = icmp eq i32 %arg, 0 diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index 308e86bbaf8fd..7df250d1fc1b4 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -8203,14 +8203,12 @@ define void @freeze_v3p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX6-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 +; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX6-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[6:9], v[2:3], s[4:7], 0 addr64 -; GFX6-GISEL-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:16 +; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX6-GISEL-NEXT: buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -8236,47 +8234,40 @@ define void @freeze_v3p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX7-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-GISEL-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-GISEL-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 +; GFX7-GISEL-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX7-GISEL-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: buffer_store_dwordx4 v[6:9], v[2:3], s[4:7], 0 addr64 -; GFX7-GISEL-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:16 +; GFX7-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX7-GISEL-NEXT: buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: freeze_v3p0: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(1) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 16, v2 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[8:9] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: freeze_v3p0: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 -; GFX9-GISEL-NEXT: global_load_dwordx4 v[6:9], v[0:1], off -; GFX9-GISEL-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX9-GISEL-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX9-GISEL-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_store_dwordx4 v[2:3], v[6:9], off -; GFX9-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off offset:16 +; GFX9-GISEL-NEXT: global_store_dwordx4 v[2:3], v[4:7], off +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX9-GISEL-NEXT: global_store_dwordx2 v[2:3], v[8:9], off offset:16 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -8296,15 +8287,12 @@ define void @freeze_v3p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 -; GFX10-GISEL-NEXT: global_load_dwordx4 v[6:9], v[0:1], off -; GFX10-GISEL-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX10-GISEL-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX10-GISEL-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[2:3], v[4:7], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: global_store_dwordx4 v[2:3], v[6:9], off -; GFX10-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off offset:16 +; GFX10-GISEL-NEXT: global_store_dwordx2 v[2:3], v[8:9], off offset:16 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: freeze_v3p0: @@ -8323,14 +8311,12 @@ define void @freeze_v3p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 -; GFX11-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off +; GFX11-GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX11-GISEL-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-GISEL-NEXT: global_store_b128 v[2:3], v[4:7], off ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: global_store_b128 v[2:3], v[6:9], off -; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off offset:16 +; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[8:9], off offset:16 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %a = load <3 x ptr>, ptr addrspace(1) %ptra %freeze = freeze <3 x ptr> %a @@ -9251,14 +9237,12 @@ define void @freeze_v3p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX6-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 +; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX6-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[6:9], v[2:3], s[4:7], 0 addr64 -; GFX6-GISEL-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:16 +; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX6-GISEL-NEXT: buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -9284,47 +9268,40 @@ define void @freeze_v3p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX7-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-GISEL-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-GISEL-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 +; GFX7-GISEL-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX7-GISEL-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: buffer_store_dwordx4 v[6:9], v[2:3], s[4:7], 0 addr64 -; GFX7-GISEL-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:16 +; GFX7-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX7-GISEL-NEXT: buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: freeze_v3p1: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(1) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 16, v2 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[8:9] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: freeze_v3p1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 -; GFX9-GISEL-NEXT: global_load_dwordx4 v[6:9], v[0:1], off -; GFX9-GISEL-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX9-GISEL-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX9-GISEL-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_store_dwordx4 v[2:3], v[6:9], off -; GFX9-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off offset:16 +; GFX9-GISEL-NEXT: global_store_dwordx4 v[2:3], v[4:7], off +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX9-GISEL-NEXT: global_store_dwordx2 v[2:3], v[8:9], off offset:16 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -9344,15 +9321,12 @@ define void @freeze_v3p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 -; GFX10-GISEL-NEXT: global_load_dwordx4 v[6:9], v[0:1], off -; GFX10-GISEL-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX10-GISEL-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX10-GISEL-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[2:3], v[4:7], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: global_store_dwordx4 v[2:3], v[6:9], off -; GFX10-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off offset:16 +; GFX10-GISEL-NEXT: global_store_dwordx2 v[2:3], v[8:9], off offset:16 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: freeze_v3p1: @@ -9371,14 +9345,12 @@ define void @freeze_v3p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 -; GFX11-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off +; GFX11-GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX11-GISEL-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-GISEL-NEXT: global_store_b128 v[2:3], v[4:7], off ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: global_store_b128 v[2:3], v[6:9], off -; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off offset:16 +; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[8:9], off offset:16 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %a = load <3 x ptr addrspace(1)>, ptr addrspace(1) %ptra %freeze = freeze <3 x ptr addrspace(1)> %a diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index ba81446a4bc09..20666560a7ec7 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -3182,12 +3182,12 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-LABEL: call_72xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s35, s33 +; GFX11-NEXT: s_mov_b32 s38, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:1600 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v62, s33 offset:1600 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3196,22 +3196,24 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_mov_b32 s36, s34 +; GFX11-NEXT: s_mov_b32 s39, s34 ; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v47, s33 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v56, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v57, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v58, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v59, s33 +; GFX11-NEXT: s_clause 0xd ; 56-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v47, s33 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v56, s33 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v57, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v58, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v59, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v61, s33 ; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 ; GFX11-NEXT: s_add_i32 s1, s32, 0x90 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 @@ -3232,7 +3234,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: s_add_i32 s1, s32, 16 ; GFX11-NEXT: s_add_i32 s2, s33, 0x200 -; GFX11-NEXT: v_writelane_b32 v60, s30, 0 +; GFX11-NEXT: v_writelane_b32 v62, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0 @@ -3253,126 +3255,106 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0 ; GFX11-NEXT: s_mov_b32 s1, return_72xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, return_72xi32@abs32@lo -; GFX11-NEXT: v_writelane_b32 v60, s31, 1 +; GFX11-NEXT: v_writelane_b32 v62, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624 -; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640 +; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: scratch_load_b128 v[43:46], off, s33 offset:624 +; GFX11-NEXT: scratch_load_b128 v[47:50], off, s33 offset:640 +; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:784 +; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:768 +; GFX11-NEXT: scratch_load_b128 v[8:11], off, s33 offset:752 +; GFX11-NEXT: scratch_load_b128 v[4:7], off, s33 offset:736 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:720 +; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:656 +; GFX11-NEXT: scratch_load_b128 v[36:39], off, s33 offset:704 +; GFX11-NEXT: scratch_load_b128 v[32:35], off, s33 offset:688 +; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:672 +; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:512 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_mov_b32_e32 v6, 24 ; GFX11-NEXT: s_add_i32 s2, s32, 0xa0 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_mov_b32_e32 v32, v48 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b128 v[48:51], off, s33 offset:656 -; GFX11-NEXT: scratch_load_b128 v[52:55], off, s33 offset:672 -; GFX11-NEXT: scratch_load_b128 v[37:40], off, s33 offset:688 -; GFX11-NEXT: scratch_load_b128 v[41:44], off, s33 offset:704 -; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:720 -; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:736 -; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:752 -; GFX11-NEXT: scratch_load_b128 v[4:7], off, s33 offset:768 -; GFX11-NEXT: scratch_load_b128 v[8:11], off, s33 offset:784 -; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:512 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: s_add_i32 s3, s32, 0x90 +; GFX11-NEXT: s_add_i32 s35, s32, 0x80 +; GFX11-NEXT: s_add_i32 s36, s32, 0x70 +; GFX11-NEXT: s_add_i32 s37, s32, 0x6c +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v53, v26 +; GFX11-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v42, v2 +; GFX11-NEXT: v_mov_b32_e32 v51, v24 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[16:19], s33 offset:1584 ; 16-byte Folded Spill -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:528 -; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 -; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:560 -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:576 -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v7, v10 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_mov_b32_e32 v10, v21 +; GFX11-NEXT: scratch_store_b128 off, v[20:23], s33 offset:1584 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:528 +; GFX11-NEXT: v_mov_b32_e32 v52, v25 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1568 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:592 +; GFX11-NEXT: scratch_store_b128 off, v[20:23], s33 offset:1536 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608 +; GFX11-NEXT: scratch_store_b128 off, v[20:23], s33 offset:1568 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:560 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1536 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 -; GFX11-NEXT: v_mov_b32_e32 v32, v36 -; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 -; GFX11-NEXT: v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v48, v51 -; GFX11-NEXT: v_dual_mov_b32 v49, v52 :: v_dual_mov_b32 v50, v53 -; GFX11-NEXT: v_dual_mov_b32 v51, v54 :: v_dual_mov_b32 v36, v55 -; GFX11-NEXT: v_dual_mov_b32 v53, v41 :: v_dual_mov_b32 v52, v40 -; GFX11-NEXT: v_dual_mov_b32 v54, v42 :: v_dual_mov_b32 v41, v56 -; GFX11-NEXT: v_dual_mov_b32 v55, v43 :: v_dual_mov_b32 v40, v44 -; GFX11-NEXT: v_dual_mov_b32 v42, v57 :: v_dual_mov_b32 v57, v12 -; GFX11-NEXT: v_dual_mov_b32 v43, v58 :: v_dual_mov_b32 v56, v59 -; GFX11-NEXT: v_mov_b32_e32 v58, v13 -; GFX11-NEXT: v_dual_mov_b32 v12, v15 :: v_dual_mov_b32 v13, v0 -; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v0, v3 -; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6 -; GFX11-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9 -; GFX11-NEXT: v_mov_b32_e32 v9, v20 -; GFX11-NEXT: scratch_store_b32 off, v11, s2 -; GFX11-NEXT: s_add_i32 s2, s32, 0x90 -; GFX11-NEXT: v_mov_b32_e32 v11, v22 -; GFX11-NEXT: scratch_store_b128 off, v[4:7], s2 -; GFX11-NEXT: s_add_i32 s2, s32, 0x80 -; GFX11-NEXT: v_mov_b32_e32 v5, v16 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 -; GFX11-NEXT: v_mov_b32_e32 v0, 24 -; GFX11-NEXT: s_add_i32 s2, s32, 0x70 -; GFX11-NEXT: v_mov_b32_e32 v6, v17 -; GFX11-NEXT: scratch_store_b128 off, v[12:15], s2 -; GFX11-NEXT: s_add_i32 s2, s32, 0x6c -; GFX11-NEXT: v_mov_b32_e32 v7, v18 -; GFX11-NEXT: scratch_store_b32 off, v0, s2 +; GFX11-NEXT: scratch_store_b128 off, v[20:23], s33 offset:1552 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:576 +; GFX11-NEXT: scratch_load_b128 v[58:61], off, s33 offset:592 +; GFX11-NEXT: scratch_load_b128 v[54:57], off, s33 offset:608 +; GFX11-NEXT: scratch_store_b128 off, v[46:49], s32 +; GFX11-NEXT: scratch_store_b32 off, v19, s2 +; GFX11-NEXT: scratch_store_b128 off, v[15:18], s3 +; GFX11-NEXT: scratch_store_b128 off, v[11:14], s35 +; GFX11-NEXT: scratch_store_b128 off, v[7:10], s36 ; GFX11-NEXT: s_add_i32 s2, s32, 0x60 -; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26 -; GFX11-NEXT: scratch_store_b96 off, v[56:58], s2 +; GFX11-NEXT: scratch_store_b32 off, v6, s37 +; GFX11-NEXT: scratch_store_b96 off, v[3:5], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 0x50 -; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 -; GFX11-NEXT: scratch_store_b128 off, v[40:43], s2 -; GFX11-NEXT: s_add_i32 s2, s32, 64 -; GFX11-NEXT: v_mov_b32_e32 v13, v24 -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s2 -; GFX11-NEXT: s_add_i32 s2, s32, 48 -; GFX11-NEXT: v_mov_b32_e32 v14, v25 -; GFX11-NEXT: scratch_store_b128 off, v[36:39], s2 -; GFX11-NEXT: s_add_i32 s2, s32, 32 -; GFX11-NEXT: v_mov_b32_e32 v16, v27 -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s2 +; GFX11-NEXT: s_add_i32 s3, s32, 64 +; GFX11-NEXT: s_add_i32 s35, s32, 48 +; GFX11-NEXT: s_add_i32 s36, s32, 32 +; GFX11-NEXT: scratch_store_b128 off, v[39:42], s2 +; GFX11-NEXT: scratch_store_b128 off, v[35:38], s3 +; GFX11-NEXT: scratch_store_b128 off, v[31:34], s35 +; GFX11-NEXT: scratch_store_b128 off, v[27:30], s36 ; GFX11-NEXT: s_add_i32 s2, s32, 16 -; GFX11-NEXT: v_mov_b32_e32 v30, v46 -; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 +; GFX11-NEXT: v_mov_b32_e32 v29, v43 +; GFX11-NEXT: scratch_store_b128 off, v[50:53], s2 ; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload -; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568 -; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 -; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 ; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 +; GFX11-NEXT: scratch_load_b128 v[5:8], off, s33 offset:1536 +; GFX11-NEXT: scratch_load_b128 v[9:12], off, s33 offset:1568 +; GFX11-NEXT: scratch_load_b128 v[13:16], off, s33 offset:1552 ; GFX11-NEXT: s_add_i32 s2, s33, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 42 +; GFX11-NEXT: v_dual_mov_b32 v30, v44 :: v_dual_mov_b32 v31, v45 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 42 +; GFX11-NEXT: v_dual_mov_b32 v17, v20 :: v_dual_mov_b32 v18, v21 +; GFX11-NEXT: v_dual_mov_b32 v19, v22 :: v_dual_mov_b32 v20, v23 +; GFX11-NEXT: v_dual_mov_b32 v21, v58 :: v_dual_mov_b32 v22, v59 +; GFX11-NEXT: v_dual_mov_b32 v23, v60 :: v_dual_mov_b32 v24, v61 +; GFX11-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v55 +; GFX11-NEXT: v_dual_mov_b32 v27, v56 :: v_dual_mov_b32 v28, v57 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v59, off, s33 -; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v56, off, s33 offset:12 -; GFX11-NEXT: scratch_load_b32 v47, off, s33 offset:16 -; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:20 -; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:24 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:28 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:32 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:36 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:40 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:44 -; GFX11-NEXT: v_readlane_b32 s31, v60, 1 -; GFX11-NEXT: v_readlane_b32 s30, v60, 0 +; GFX11-NEXT: s_clause 0xd ; 56-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v61, off, s33 +; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v59, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:12 +; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:16 +; GFX11-NEXT: scratch_load_b32 v56, off, s33 offset:20 +; GFX11-NEXT: scratch_load_b32 v47, off, s33 offset:24 +; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:28 +; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:32 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:36 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:40 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:44 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:48 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:52 +; GFX11-NEXT: v_readlane_b32 s31, v62, 1 +; GFX11-NEXT: v_readlane_b32 s30, v62, 0 ; GFX11-NEXT: s_mov_b32 s32, s34 -; GFX11-NEXT: s_mov_b32 s34, s36 +; GFX11-NEXT: s_mov_b32 s34, s39 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1600 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v62, off, s33 offset:1600 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_mov_b32 s33, s35 +; GFX11-NEXT: s_mov_b32 s33, s38 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 347fddbedb0a7..25996ee11c5a1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -1003,10 +1003,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v28, s2 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v28, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 @@ -2000,10 +2000,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v28, s2 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v28, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 @@ -2349,10 +2349,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v28, s2 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v28, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 @@ -2698,10 +2698,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v28, s2 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v28, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 @@ -3047,10 +3047,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v28, s2 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v28, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index a10c861601c2c..192b4983ed7f8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -3011,17 +3011,17 @@ define void @test_writelane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %sr ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-GISEL-NEXT: v_add_u32_e32 v17, vcc, 16, v0 -; GFX802-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc ; GFX802-GISEL-NEXT: flat_load_dwordx4 v[9:12], v[0:1] +; GFX802-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc ; GFX802-GISEL-NEXT: flat_load_dwordx4 v[13:16], v[17:18] ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v8 ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6 ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7 -; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) ; GFX802-GISEL-NEXT: v_writelane_b32 v9, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v10, s6, m0 @@ -3030,10 +3030,8 @@ define void @test_writelane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %sr ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX802-GISEL-NEXT: v_writelane_b32 v13, s9, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v14, s10, m0 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, v14 ; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[9:12] -; GFX802-GISEL-NEXT: flat_store_dwordx2 v[17:18], v[2:3] +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[17:18], v[13:14] ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -3043,25 +3041,23 @@ define void @test_writelane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %sr ; GFX1010-GISEL-NEXT: s_clause 0x1 ; GFX1010-GISEL-NEXT: global_load_dwordx4 v[9:12], v[0:1], off ; GFX1010-GISEL-NEXT: global_load_dwordx4 v[13:16], v[0:1], off offset:16 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v8 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v6 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v7 ; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v8 ; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v4 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v5 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7 ; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1) ; GFX1010-GISEL-NEXT: v_writelane_b32 v9, s4, s5 -; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s7, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s8, s5 ; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s6, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s9, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s10, s5 -; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX1010-GISEL-NEXT: v_mov_b32_e32 v3, v14 +; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s7, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s8, s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s9, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s10, s5 ; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[9:12], off -; GFX1010-GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v[0:1], v[13:14], off offset:16 ; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-GISEL-LABEL: test_writelane_v3i64: @@ -3070,26 +3066,24 @@ define void @test_writelane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %sr ; GFX1100-GISEL-NEXT: s_clause 0x1 ; GFX1100-GISEL-NEXT: global_load_b128 v[9:12], v[0:1], off ; GFX1100-GISEL-NEXT: global_load_b128 v[13:16], v[0:1], off offset:16 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v6 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v8 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v4 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v5 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1) ; GFX1100-GISEL-NEXT: v_writelane_b32 v9, s0, s1 -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s3, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s4, s1 ; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s2, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s5, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s6, s1 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v3, v14 +; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s3, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s4, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s5, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s6, s1 ; GFX1100-GISEL-NEXT: s_clause 0x1 ; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[9:12], off -; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off offset:16 +; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[13:14], off offset:16 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] %oldval = load <3 x i64>, ptr addrspace(1) %out %writelane = call <3 x i64> @llvm.amdgcn.writelane.v2i64(<3 x i64> %src, i32 %src1, <3 x i64> %oldval) diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index d23c49165ec70..388006281abdc 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -6359,7 +6359,6 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -6369,7 +6368,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v3 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v3 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v3, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 0, 16 @@ -6404,32 +6403,31 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 ; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -6448,12 +6446,11 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v11, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 @@ -6977,27 +6974,25 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v7 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 16, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v7, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v11, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v9, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v3, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v1 @@ -7008,22 +7003,22 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -7067,57 +7062,55 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v9, v18, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v7 +; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[7:10] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10] ; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v9, v11, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_bfe_i32 v6, v19, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v6, v14, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10] ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 +; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v18, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_bfe_i32 v2, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v15, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -7132,62 +7125,60 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v20, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i16_to_v16i64: @@ -8116,19 +8107,17 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v15 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v15 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v15, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) @@ -8137,11 +8126,10 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v13, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v11 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v27, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v3, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) @@ -8150,35 +8138,34 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v1, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v7, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v5 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v11 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v11 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v11, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v12, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v14, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v20, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v22, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v9 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v9, 0, 16 @@ -8237,8 +8224,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -8246,161 +8233,157 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v15 +; GCN-HSA-NEXT: v_bfe_i32 v15, v15, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 +; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[15:18] +; GCN-HSA-NEXT: s_waitcnt vmcnt(4) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v13, v14, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[13:16] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[13:16] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_bfe_i32 v12, v9, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_bfe_i32 v16, v29, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v18, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v10, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v11 +; GCN-HSA-NEXT: v_bfe_i32 v11, v11, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 +; GCN-HSA-NEXT: v_bfe_i32 v11, v9, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v9 +; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v17, v27, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[11:14] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v20, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v14, v19, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[12:15] +; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_bfe_i32 v8, v21, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1 +; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v26, v26, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: v_bfe_i32 v26, v27, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-HSA-NEXT: v_bfe_i32 v14, v27, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-HSA-NEXT: v_bfe_i32 v18, v18, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v19, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[10:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_bfe_i32 v10, v19, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5 +; GCN-HSA-NEXT: v_bfe_i32 v12, v5, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: @@ -8413,115 +8396,110 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[9:12], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[15:18], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v16, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v17, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v18, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v16, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:160 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v9, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v12 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v19, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v10 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v17, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v18, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v16, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v19, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v20, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index b4c0b7497b95f..04d906ca6ad9c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -6137,9 +6137,8 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; SI-NEXT: v_mov_b32_e32 v16, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v1 ; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v3 @@ -6147,9 +6146,9 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v8, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v10, v9, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v3, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v9, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; SI-NEXT: v_bfe_i32 v14, v11, 0, 16 @@ -6171,14 +6170,13 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v3 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; VI-NO-DS128-NEXT: v_bfe_i32 v10, v10, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v3, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v5, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v7, 0, 16 @@ -6206,14 +6204,13 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v5, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v7, 0, 16 @@ -6331,14 +6328,13 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 -; VI-DS128-NEXT: v_mov_b32_e32 v0, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; VI-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -6365,15 +6361,14 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX9-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -6813,18 +6808,16 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 ; SI-NEXT: v_mov_b32_e32 v18, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_mov_b32_e32 v12, v3 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v8, 16, v5 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v3 ; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v3 -; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v3, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:14 offset1:15 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1 @@ -6834,7 +6827,7 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:10 offset1:11 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v7 ; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; SI-NEXT: v_bfe_i32 v12, v14, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v7, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:6 offset1:7 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 @@ -6842,10 +6835,10 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: v_bfe_i32 v5, v6, 0, 16 ; SI-NEXT: v_bfe_i32 v10, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v7, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v12, v19, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v17, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_bfe_i32 v14, v17, 0, 16 +; SI-NEXT: v_bfe_i32 v14, v14, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_bfe_i32 v16, v16, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 @@ -6891,38 +6884,36 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, v7 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[14:15] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v3, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[4:5] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[12:13] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[10:11] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[8:9] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64: @@ -6953,38 +6944,36 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[14:15] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[4:5] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[12:13] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[10:11] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[8:9] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v16i16_to_v16i64: @@ -7167,11 +7156,12 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_read_b128 v[3:6], v0 ; VI-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_mov_b32_e32 v18, v6 +; VI-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v8 ; VI-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-DS128-NEXT: v_bfe_i32 v13, v8, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v13, v3, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 @@ -7179,44 +7169,41 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 -; VI-DS128-NEXT: v_mov_b32_e32 v15, v10 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 ; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64 -; VI-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v11, v10, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; VI-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112 +; VI-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v9 ; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; VI-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96 -; VI-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v14, v5, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48 -; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:48 +; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:32 ; VI-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16 ; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] ; VI-DS128-NEXT: s_endpgm @@ -7242,43 +7229,41 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v10 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64 -; GFX9-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v11, v10, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v9 ; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v6 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v5, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:32 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] ; GFX9-DS128-NEXT: s_endpgm @@ -8032,10 +8017,9 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:6 offset1:7 ; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5 ; SI-NEXT: s_waitcnt lgkmcnt(3) -; SI-NEXT: v_mov_b32_e32 v18, v7 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v7 ; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v7 -; SI-NEXT: v_bfe_i32 v18, v18, 0, 16 +; SI-NEXT: v_bfe_i32 v18, v7, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: v_mov_b32_e32 v7, s0 ; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:14 offset1:15 @@ -8045,10 +8029,9 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:10 offset1:11 ; SI-NEXT: s_waitcnt lgkmcnt(4) -; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v3 ; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v3 -; SI-NEXT: v_bfe_i32 v18, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v18, v3, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:6 offset1:7 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v1 @@ -8057,10 +8040,9 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:2 offset1:3 ; SI-NEXT: s_waitcnt lgkmcnt(5) -; SI-NEXT: v_mov_b32_e32 v1, v11 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v11 ; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v11 -; SI-NEXT: v_bfe_i32 v18, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v18, v11, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:30 offset1:31 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v9 @@ -8069,12 +8051,11 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:26 offset1:27 ; SI-NEXT: s_waitcnt lgkmcnt(6) -; SI-NEXT: v_mov_b32_e32 v1, v15 -; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v15 -; SI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:22 offset1:23 +; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v15 +; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v15 +; SI-NEXT: v_bfe_i32 v18, v15, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:22 offset1:23 ; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v13 ; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v13 ; SI-NEXT: v_bfe_i32 v17, v13, 0, 16 @@ -8135,19 +8116,19 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v7 offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v7 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v4 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v3, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v3, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; VI-NO-DS128-NEXT: ds_read2_b64 v[3:6], v7 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_read2_b64 v[7:10], v7 offset1:1 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: ds_read2_b64 v[7:10], v4 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[3:6], v4 offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[18:19] offset0:30 offset1:31 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v18, v2, 0, 16 @@ -8179,87 +8160,86 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(5) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:20 offset1:21 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v13, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[14:15] offset0:18 offset1:19 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v12, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v16, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v12, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[17:18] offset0:16 offset1:17 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v6, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[19:20] offset0:16 offset1:17 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v10, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[15:16] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_bfe_i32 v5, v12, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[5:6] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v3, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[19:20], v[17:18] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v9, v9, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; VI-NO-DS128-NEXT: v_bfe_i32 v19, v19, 0, 16 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[21:22] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v7, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v8, v9, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[9:10], v[17:18] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_bfe_i32 v9, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v8, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[9:10] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v7, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[19:20], v[17:18] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v5, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v6, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; VI-NO-DS128-NEXT: v_bfe_i32 v21, v3, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v4, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[19:20] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[8:9], v[15:16] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[6:7], v[13:14] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[4:5], v[1:2] offset1:1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[8:9] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[3:4], v[13:14] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[21:22], v[1:2] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v8 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v8 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v11 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v11 offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v9, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v8, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[11:14], v8 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[7:10], v8 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[7:10], v11 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[11:14], v11 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:30 offset1:31 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 @@ -8296,63 +8276,62 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v1, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[2:3] offset0:18 offset1:19 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset0:16 offset1:17 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:16 offset1:17 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v10, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v14, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[9:10], v[3:4] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[3:4] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v20, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, v14 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v14, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v13, v13, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v11 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v18, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v19, 0, 16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[20:21] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[13:14], v[16:17] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v12, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[13:14] offset0:10 offset1:11 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v11, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v20, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v8, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[12:13] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[1:2] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[5:6] offset1:1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[9:10], v[12:13] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[7:8], v[1:2] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[20:21], v[5:6] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v32i16_to_v32i64: @@ -8715,11 +8694,10 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-DS128-NEXT: v_mov_b32_e32 v2, v3 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:224 -; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v15, v3, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v13, v3, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -8736,44 +8714,38 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v11 ; VI-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:192 -; VI-DS128-NEXT: v_mov_b32_e32 v13, v12 ; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:160 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; VI-DS128-NEXT: v_bfe_i32 v0, v13, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v12, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:176 -; VI-DS128-NEXT: v_bfe_i32 v0, v9, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; VI-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; VI-DS128-NEXT: v_bfe_i32 v0, v9, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:144 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v19 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_bfe_i32 v9, v19, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16 ; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:128 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(6) ; VI-DS128-NEXT: v_bfe_i32 v0, v5, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; VI-DS128-NEXT: v_mov_b32_e32 v5, v20 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; VI-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v19 +; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:144 +; VI-DS128-NEXT: v_bfe_i32 v9, v19, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:96 -; VI-DS128-NEXT: v_bfe_i32 v9, v5, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:96 +; VI-DS128-NEXT: v_bfe_i32 v9, v20, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 @@ -8783,18 +8755,20 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v18 ; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:64 ; VI-DS128-NEXT: v_bfe_i32 v9, v4, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v15, v5, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-DS128-NEXT: v_bfe_i32 v11, v4, 0, 16 -; VI-DS128-NEXT: v_mov_b32_e32 v4, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:192 +; VI-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:80 -; VI-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; VI-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16 @@ -8831,11 +8805,10 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[14:17] offset:224 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v6, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v7, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4 @@ -8851,44 +8824,43 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(5) ; GFX9-DS128-NEXT: v_bfe_i32 v4, v2, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:192 -; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v3 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:160 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:176 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:176 ; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v20 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(6) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:144 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v20, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:128 ; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v21 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:192 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:144 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v20, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:96 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:96 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v21, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 @@ -8896,26 +8868,25 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:112 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:64 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v19, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:64 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v19, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:80 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:80 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v9, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v11, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll index 2e9d1b4c8f7e5..7457509ffe193 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -1054,18 +1054,17 @@ define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) % ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, 34 -; SI-NEXT: v_mov_b32_e32 v1, 0x3e7 -; SI-NEXT: v_mov_b32_e32 v2, 0x41 -; SI-NEXT: v_mov_b32_e32 v3, 33 +; SI-NEXT: v_mov_b32_e32 v2, 34 +; SI-NEXT: v_mov_b32_e32 v3, 0x3e7 +; SI-NEXT: v_mov_b32_e32 v4, 0x41 +; SI-NEXT: v_mov_b32_e32 v5, 33 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, 0x62 +; SI-NEXT: v_mov_b32_e32 v1, 0x5b ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0x62 -; SI-NEXT: v_mov_b32_e32 v3, 0x5b -; SI-NEXT: v_mov_b32_e32 v4, 0xd4 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; SI-NEXT: v_mov_b32_e32 v2, 0xd4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_endpgm ; ; CI-LABEL: merge_global_store_8_constants_i32: @@ -1073,17 +1072,16 @@ define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_mov_b32_e32 v0, 34 -; CI-NEXT: v_mov_b32_e32 v1, 0x3e7 -; CI-NEXT: v_mov_b32_e32 v2, 0x41 -; CI-NEXT: v_mov_b32_e32 v3, 33 +; CI-NEXT: v_mov_b32_e32 v2, 34 +; CI-NEXT: v_mov_b32_e32 v3, 0x3e7 +; CI-NEXT: v_mov_b32_e32 v4, 0x41 +; CI-NEXT: v_mov_b32_e32 v5, 33 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; CI-NEXT: v_mov_b32_e32 v4, 0xd4 -; CI-NEXT: v_mov_b32_e32 v2, 0x62 -; CI-NEXT: v_mov_b32_e32 v3, 0x5b -; CI-NEXT: v_mov_b32_e32 v5, v1 -; CI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 +; CI-NEXT: v_mov_b32_e32 v0, 0x62 +; CI-NEXT: v_mov_b32_e32 v1, 0x5b +; CI-NEXT: v_mov_b32_e32 v2, 0xd4 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; CI-NEXT: s_endpgm store i32 34, ptr addrspace(1) %out, align 4 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index 122d69c20c49e..c8cc40faf1e84 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -1005,10 +1005,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: v_accvgpr_write_b32 a3, s7 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] -; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v1, v2, a[0:3] -; FAST90A-NEXT: s_nop 4 -; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a4 -; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a5 +; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v1, v2, a[0:3] ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] ; FAST90A-NEXT: s_nop 4 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index a29dc34c56d3a..71981e3599b87 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -20,7 +20,7 @@ ; CHECK-LABEL: {{^}}call_72xi32: -; GFX11-PAL: NumSgprs: 37 +; GFX11-PAL: NumSgprs: 40 ; GFX11-PAL-GCNTRACKERS: NumSgprs: 37 ; GFX11-PAL: NumVgprs: 64 ; GFX11-PAL-GCNTRACKERS: NumVgprs: 64 diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index 840916aa63949..c253f42e0d3c8 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -596,88 +596,84 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:224 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:304 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX9-FLATSCR-NEXT: scratch_load_dword v31, v0, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:800 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 0x200, v30 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v31, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: ps_main: @@ -686,89 +682,85 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v34, off ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-PAL-LABEL: ps_main: @@ -777,92 +769,88 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:304 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v31, v0, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v0, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v31, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: ps_main: @@ -876,165 +864,158 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 ; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v34, off +; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX11-FLATSCR-LABEL: ps_main: ; GFX11-FLATSCR: ; %bb.0: -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e -; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v23, v21 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f523be1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v28 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f5f2ee2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v19, 0xbefcd89f :: v_dual_mov_b32 v30, v27 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f638e37 :: v_dual_mov_b32 v4, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[25:28], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3efcd89f :: v_dual_mov_b32 v16, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_and_b32 v33, 0x1fc, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0xbefcd8a3 :: v_dual_mov_b32 v15, v11 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xb7043519 :: v_dual_mov_b32 v10, v13 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbe31934f :: v_dual_mov_b32 v31, v11 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 ; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[6:9], off offset:240 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:224 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v14, 0x3f20e7f5 :: v_dual_mov_b32 v9, v6 +; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:272 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:256 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x2 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v29, 0xbf523be1 :: v_dual_mov_b32 v30, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v31, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:192 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v33, off +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v32, 0x3f3d349c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[4:7], off offset:832 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v7, 0x3703c499 :: v_dual_mov_b32 v16, v28 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 -; GFX11-FLATSCR-NEXT: s_clause 0x4 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v23, v25 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, v26 :: v_dual_mov_b32 v18, v12 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, v17 :: v_dual_mov_b32 v21, v3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:736 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v14, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX11-FLATSCR-NEXT: s_clause 0x4 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[7:10], off offset:816 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[18:21], off offset:800 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[22:25], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:704 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v1, v33, off offset:512 ; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0 +; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-FLATSCR-NEXT: ; return to shader part epilog %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -1619,88 +1600,84 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:224 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:304 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX9-FLATSCR-NEXT: scratch_load_dword v31, v0, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:800 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 0x200, v30 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v31, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: vs_main: @@ -1709,89 +1686,85 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v34, off +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-PAL-LABEL: vs_main: @@ -1800,92 +1773,88 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:304 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v31, v0, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v0, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v31, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: vs_main: @@ -1899,165 +1868,158 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 +; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v34, off ; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX11-FLATSCR-LABEL: vs_main: ; GFX11-FLATSCR: ; %bb.0: -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e -; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v23, v21 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f523be1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v28 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f5f2ee2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v19, 0xbefcd89f :: v_dual_mov_b32 v30, v27 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f638e37 :: v_dual_mov_b32 v4, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[25:28], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3efcd89f :: v_dual_mov_b32 v16, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_and_b32 v33, 0x1fc, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0xbefcd8a3 :: v_dual_mov_b32 v15, v11 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xb7043519 :: v_dual_mov_b32 v10, v13 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbe31934f :: v_dual_mov_b32 v31, v11 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 ; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[6:9], off offset:240 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:224 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v14, 0x3f20e7f5 :: v_dual_mov_b32 v9, v6 +; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:272 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:256 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x2 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v29, 0xbf523be1 :: v_dual_mov_b32 v30, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v31, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:192 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v33, off +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v32, 0x3f3d349c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[4:7], off offset:832 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v7, 0x3703c499 :: v_dual_mov_b32 v16, v28 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 -; GFX11-FLATSCR-NEXT: s_clause 0x4 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v23, v25 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, v26 :: v_dual_mov_b32 v18, v12 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, v17 :: v_dual_mov_b32 v21, v3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:736 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v14, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX11-FLATSCR-NEXT: s_clause 0x4 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[7:10], off offset:816 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[18:21], off offset:800 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[22:25], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:704 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v1, v33, off offset:512 ; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0 +; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-FLATSCR-NEXT: ; return to shader part epilog %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -2640,90 +2602,86 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX9-FLATSCR-LABEL: cs_main: ; GFX9-FLATSCR: ; %bb.0: ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2 -; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x3f523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:320 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0x3eae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3e31934f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v31, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f638e37 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:288 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0xbe319356 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[10:13], s0 offset:240 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v26 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0x3703c499 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:256 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v31 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v12 +; GFX9-FLATSCR-NEXT: scratch_load_dword v32, v4, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v7 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:720 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v26 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v30, v1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v23 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v21 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:752 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 0x200, v31 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:800 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v32, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: cs_main: @@ -2732,89 +2690,85 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v34, off ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-PAL-LABEL: cs_main: @@ -2822,93 +2776,89 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3] ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x10 -; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:320 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0x3eae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3e31934f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v31, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f638e37 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:288 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[10:13], s0 offset:240 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v26 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v31 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v12 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v32, v4, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v7 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v26 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v23 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v21 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v0, 0x200, v31 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:800 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v32, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: cs_main: @@ -2922,165 +2872,158 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 +; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v34, off ; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX11-FLATSCR-LABEL: cs_main: ; GFX11-FLATSCR: ; %bb.0: -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e -; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v23, v21 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f523be1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v28 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f5f2ee2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v19, 0xbefcd89f :: v_dual_mov_b32 v30, v27 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f638e37 :: v_dual_mov_b32 v4, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[25:28], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3efcd89f :: v_dual_mov_b32 v16, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_and_b32 v33, 0x1fc, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0xbefcd8a3 :: v_dual_mov_b32 v15, v11 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xb7043519 :: v_dual_mov_b32 v10, v13 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbe31934f :: v_dual_mov_b32 v31, v11 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 ; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[6:9], off offset:240 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:224 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v14, 0x3f20e7f5 :: v_dual_mov_b32 v9, v6 +; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:272 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:256 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x2 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v29, 0xbf523be1 :: v_dual_mov_b32 v30, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v31, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:192 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v33, off +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v32, 0x3f3d349c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[4:7], off offset:832 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v7, 0x3703c499 :: v_dual_mov_b32 v16, v28 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 -; GFX11-FLATSCR-NEXT: s_clause 0x4 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v23, v25 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, v26 :: v_dual_mov_b32 v18, v12 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, v17 :: v_dual_mov_b32 v21, v3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:736 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v14, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX11-FLATSCR-NEXT: s_clause 0x4 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[7:10], off offset:816 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[18:21], off offset:800 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[22:25], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:704 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v1, v33, off offset:512 ; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0 +; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-FLATSCR-NEXT: ; return to shader part epilog %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -3662,88 +3605,84 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:224 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:304 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX9-FLATSCR-NEXT: scratch_load_dword v31, v0, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:800 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 0x200, v30 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v31, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: hs_main: @@ -3752,89 +3691,85 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v34, off ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-PAL-LABEL: hs_main: @@ -3843,92 +3778,88 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:304 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v31, v0, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v0, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v31, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: hs_main: @@ -3942,165 +3873,158 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 ; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v34, off +; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX11-FLATSCR-LABEL: hs_main: ; GFX11-FLATSCR: ; %bb.0: -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e -; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v23, v21 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f523be1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v28 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f5f2ee2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v19, 0xbefcd89f :: v_dual_mov_b32 v30, v27 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f638e37 :: v_dual_mov_b32 v4, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[25:28], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3efcd89f :: v_dual_mov_b32 v16, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_and_b32 v33, 0x1fc, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0xbefcd8a3 :: v_dual_mov_b32 v15, v11 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xb7043519 :: v_dual_mov_b32 v10, v13 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbe31934f :: v_dual_mov_b32 v31, v11 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 ; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[6:9], off offset:240 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:224 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v14, 0x3f20e7f5 :: v_dual_mov_b32 v9, v6 +; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:272 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:256 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x2 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v29, 0xbf523be1 :: v_dual_mov_b32 v30, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v31, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:192 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v33, off +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v32, 0x3f3d349c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[4:7], off offset:832 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v7, 0x3703c499 :: v_dual_mov_b32 v16, v28 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 -; GFX11-FLATSCR-NEXT: s_clause 0x4 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v23, v25 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, v26 :: v_dual_mov_b32 v18, v12 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, v17 :: v_dual_mov_b32 v21, v3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:736 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v14, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX11-FLATSCR-NEXT: s_clause 0x4 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[7:10], off offset:816 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[18:21], off offset:800 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[22:25], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:704 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v1, v33, off offset:512 ; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0 +; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-FLATSCR-NEXT: ; return to shader part epilog %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -4682,88 +4606,84 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:224 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:304 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX9-FLATSCR-NEXT: scratch_load_dword v31, v0, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:800 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 0x200, v30 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v31, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: gs_main: @@ -4772,89 +4692,85 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v34, off +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-PAL-LABEL: gs_main: @@ -4863,92 +4779,88 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:304 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v31, v0, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v0, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v31, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: gs_main: @@ -4962,165 +4874,158 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 ; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v34, off +; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX11-FLATSCR-LABEL: gs_main: ; GFX11-FLATSCR: ; %bb.0: -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e -; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v23, v21 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f523be1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v28 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f5f2ee2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v19, 0xbefcd89f :: v_dual_mov_b32 v30, v27 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f638e37 :: v_dual_mov_b32 v4, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[25:28], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3efcd89f :: v_dual_mov_b32 v16, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_and_b32 v33, 0x1fc, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0xbefcd8a3 :: v_dual_mov_b32 v15, v11 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xb7043519 :: v_dual_mov_b32 v10, v13 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbe31934f :: v_dual_mov_b32 v31, v11 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 ; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[6:9], off offset:240 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:224 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v14, 0x3f20e7f5 :: v_dual_mov_b32 v9, v6 +; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:272 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:256 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x2 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v29, 0xbf523be1 :: v_dual_mov_b32 v30, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v31, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:192 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v33, off +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v32, 0x3f3d349c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[4:7], off offset:832 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v7, 0x3703c499 :: v_dual_mov_b32 v16, v28 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 -; GFX11-FLATSCR-NEXT: s_clause 0x4 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v23, v25 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, v26 :: v_dual_mov_b32 v18, v12 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, v17 :: v_dual_mov_b32 v21, v3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:736 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v14, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX11-FLATSCR-NEXT: s_clause 0x4 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[7:10], off offset:816 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[18:21], off offset:800 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[22:25], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:704 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v1, v33, off offset:512 ; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0 +; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-FLATSCR-NEXT: ; return to shader part epilog %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -5711,89 +5616,85 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:224 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:304 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX9-FLATSCR-NEXT: scratch_load_dword v31, v0, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:800 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 0x200, v30 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-NEXT: s_mov_b32 s2, s7 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v31, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: hs_ir_uses_scratch_offset: @@ -5802,90 +5703,86 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 ; GFX10-FLATSCR-NEXT: s_mov_b32 s2, s7 ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v34, off +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-PAL-LABEL: hs_ir_uses_scratch_offset: @@ -5894,93 +5791,89 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:304 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v31, v0, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v0, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v31, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: hs_ir_uses_scratch_offset: @@ -5994,167 +5887,158 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 ; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5 ; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v34, off +; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX11-FLATSCR-LABEL: hs_ir_uses_scratch_offset: ; GFX11-FLATSCR: ; %bb.0: ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e -; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f5f2ee2 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v28 :: v_dual_mov_b32 v19, 0xbefcd89f +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v27 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, 0xbf523be3 :: v_dual_mov_b32 v23, v21 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f638e37 :: v_dual_mov_b32 v4, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v17, 0xbeae29dc :: v_dual_mov_b32 v16, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[25:28], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_and_b32 v33, 0x1fc, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0xbefcd8a3 :: v_dual_mov_b32 v15, v11 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xb7043519 :: v_dual_mov_b32 v10, v13 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbe31934f :: v_dual_mov_b32 v31, v11 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 ; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[6:9], off offset:240 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:224 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v14, 0x3f20e7f5 :: v_dual_mov_b32 v9, v6 +; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:272 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:256 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x2 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v29, 0xbf523be1 :: v_dual_mov_b32 v30, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v31, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:192 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v33, off +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v32, 0x3f3d349c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[4:7], off offset:832 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v7, 0x3703c499 :: v_dual_mov_b32 v16, v28 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 -; GFX11-FLATSCR-NEXT: s_clause 0x4 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v23, v25 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, v26 :: v_dual_mov_b32 v18, v12 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, v17 :: v_dual_mov_b32 v21, v3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:736 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v14, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX11-FLATSCR-NEXT: s_clause 0x4 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[7:10], off offset:816 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[18:21], off offset:800 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[22:25], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:704 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v1, v33, off offset:512 ; GFX11-FLATSCR-NEXT: s_mov_b32 s2, s5 ; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0 +; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-FLATSCR-NEXT: ; return to shader part epilog %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -6743,89 +6627,85 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:224 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:304 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 offset:240 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX9-FLATSCR-NEXT: scratch_load_dword v31, v0, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:800 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 0x200, v30 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-NEXT: s_mov_b32 s2, s7 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v31, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: gs_ir_uses_scratch_offset: @@ -6834,90 +6714,86 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 ; GFX10-FLATSCR-NEXT: s_mov_b32 s2, s7 ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v34, off +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX9-FLATSCR-PAL-LABEL: gs_ir_uses_scratch_offset: @@ -6926,93 +6802,89 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v28 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:304 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v31, v0, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v0, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v31, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: gs_ir_uses_scratch_offset: @@ -7026,167 +6898,158 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3f5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0x3f523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbeae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0x3f638e37 ; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf5f2ee3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbefcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbefcd8a3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0xbf523be3 +; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v34, 0x200, v0 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], off offset:304 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f -; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[17:20], off offset:272 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519 ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f ; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf638e39 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v22 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:224 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v25 ; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11 -; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720 -; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704 -; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:240 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[22:25], off offset:208 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:192 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v1, v5, off +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, 0xbf523be1 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0x3f3d349c +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, v28 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v11 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3703c499 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, v29 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0x3f20e7f4 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v12 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v17 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v26 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v22 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v27 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:784 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, 0xbf5f2ee2 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v27 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], off offset:736 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, v19 +; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, v17 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], off offset:816 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:800 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:768 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[30:33], off offset:752 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], off offset:720 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[11:14], off offset:704 ; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5 ; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:832 +; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v34, off +; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX11-FLATSCR-LABEL: gs_ir_uses_scratch_offset: ; GFX11-FLATSCR: ; %bb.0: ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3f3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e -; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, 0x3f5f2ee2 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v28 :: v_dual_mov_b32 v19, 0xbefcd89f +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v27 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, 0xbf523be3 :: v_dual_mov_b32 v23, v21 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f638e37 :: v_dual_mov_b32 v4, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v17, 0xbeae29dc :: v_dual_mov_b32 v16, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbf3d349e +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3eae29dc +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3eae29d8 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3e319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[25:28], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_and_b32 v33, 0x1fc, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0xbefcd8a3 :: v_dual_mov_b32 v15, v11 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xb7043519 :: v_dual_mov_b32 v10, v13 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbe31934f :: v_dual_mov_b32 v31, v11 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 ; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[6:9], off offset:240 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:224 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v14, 0x3f20e7f5 :: v_dual_mov_b32 v9, v6 +; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:272 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:256 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x2 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v29, 0xbf523be1 :: v_dual_mov_b32 v30, v7 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v31, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:192 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v33, off +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v32, 0x3f3d349c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[4:7], off offset:832 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v7, 0x3703c499 :: v_dual_mov_b32 v16, v28 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 -; GFX11-FLATSCR-NEXT: s_clause 0x4 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v26 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v23, v25 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, v26 :: v_dual_mov_b32 v18, v12 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, v17 :: v_dual_mov_b32 v21, v3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:736 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v14, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v7 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v13, v19 +; GFX11-FLATSCR-NEXT: s_clause 0x4 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[7:10], off offset:816 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[18:21], off offset:800 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[22:25], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704 -; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[11:14], off offset:704 +; GFX11-FLATSCR-NEXT: scratch_load_b32 v1, v33, off offset:512 ; GFX11-FLATSCR-NEXT: s_mov_b32 s2, s5 ; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0 +; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-FLATSCR-NEXT: ; return to shader part epilog %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll index 07ca294019341..f1d147947ccdf 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll @@ -138,12 +138,11 @@ define void @v_shuffle_v2f32_v4f32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -273,12 +272,11 @@ define void @v_shuffle_v2f32_v4f32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -316,16 +314,14 @@ define void @v_shuffle_v2f32_v4f32__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v5, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -370,15 +366,14 @@ define void @v_shuffle_v2f32_v4f32__7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v7, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -423,15 +418,14 @@ define void @v_shuffle_v2f32_v4f32__7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx2 v7, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -476,15 +470,14 @@ define void @v_shuffle_v2f32_v4f32__7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -529,13 +522,12 @@ define void @v_shuffle_v2f32_v4f32__7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -575,12 +567,12 @@ define void @v_shuffle_v2f32_v4f32__7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -618,12 +610,12 @@ define void @v_shuffle_v2f32_v4f32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -661,12 +653,12 @@ define void @v_shuffle_v2f32_v4f32__7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -704,12 +696,11 @@ define void @v_shuffle_v2f32_v4f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -872,13 +863,12 @@ define void @v_shuffle_v2f32_v4f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -917,12 +907,11 @@ define void @v_shuffle_v2f32_v4f32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1227,12 +1216,12 @@ define void @v_shuffle_v2f32_v4f32__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1582,12 +1571,12 @@ define void @v_shuffle_v2f32_v4f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1933,12 +1922,12 @@ define void @v_shuffle_v2f32_v4f32__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2250,12 +2239,11 @@ define void @v_shuffle_v2f32_v4f32__3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2592,11 +2580,11 @@ define void @v_shuffle_v2f32_v4f32__3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2972,11 +2960,11 @@ define void @v_shuffle_v2f32_v4f32__3_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3354,11 +3342,11 @@ define void @v_shuffle_v2f32_v4f32__3_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll index 3deb23ca5314b..c17adef30c77a 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll @@ -298,12 +298,11 @@ define void @v_shuffle_v2f32_v8f32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,12 +596,11 @@ define void @v_shuffle_v2f32_v8f32__15_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -640,16 +638,14 @@ define void @v_shuffle_v2f32_v8f32__15_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v9, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v8 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v9, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -694,15 +690,14 @@ define void @v_shuffle_v2f32_v8f32__15_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v15, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[7:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v15, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -747,15 +742,14 @@ define void @v_shuffle_v2f32_v8f32__15_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v11, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v10 -; GFX900-NEXT: global_store_dwordx2 v11, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -800,15 +794,14 @@ define void @v_shuffle_v2f32_v8f32__15_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v13, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[5:12] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v13, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -853,15 +846,14 @@ define void @v_shuffle_v2f32_v8f32__15_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v13, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v12 -; GFX900-NEXT: global_store_dwordx2 v13, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -906,15 +898,14 @@ define void @v_shuffle_v2f32_v8f32__15_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v11, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[3:10] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v11, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -959,15 +950,14 @@ define void @v_shuffle_v2f32_v8f32__15_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v15, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v14 -; GFX900-NEXT: global_store_dwordx2 v15, v[5:6], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1012,15 +1002,14 @@ define void @v_shuffle_v2f32_v8f32__15_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[1:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1065,13 +1054,12 @@ define void @v_shuffle_v2f32_v8f32__15_8(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_8: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1111,12 +1099,12 @@ define void @v_shuffle_v2f32_v8f32__15_9(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_9: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1154,12 +1142,12 @@ define void @v_shuffle_v2f32_v8f32__15_10(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_10: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1199,12 +1187,12 @@ define void @v_shuffle_v2f32_v8f32__15_11(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_11: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v3 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1242,12 +1230,12 @@ define void @v_shuffle_v2f32_v8f32__15_12(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_12: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1287,12 +1275,12 @@ define void @v_shuffle_v2f32_v8f32__15_13(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_13: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v5 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1330,12 +1318,12 @@ define void @v_shuffle_v2f32_v8f32__15_14(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_14: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1373,12 +1361,12 @@ define void @v_shuffle_v2f32_v8f32__15_15(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_15: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1416,12 +1404,11 @@ define void @v_shuffle_v2f32_v8f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[1:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1756,13 +1743,12 @@ define void @v_shuffle_v2f32_v8f32__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1801,12 +1787,11 @@ define void @v_shuffle_v2f32_v8f32__8_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[1:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2491,12 +2476,12 @@ define void @v_shuffle_v2f32_v8f32__7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3228,12 +3213,12 @@ define void @v_shuffle_v2f32_v8f32__7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3961,12 +3946,12 @@ define void @v_shuffle_v2f32_v8f32__7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v3 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4698,12 +4683,12 @@ define void @v_shuffle_v2f32_v8f32__7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5431,12 +5416,12 @@ define void @v_shuffle_v2f32_v8f32__7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v5 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6170,12 +6155,12 @@ define void @v_shuffle_v2f32_v8f32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6901,12 +6886,12 @@ define void @v_shuffle_v2f32_v8f32__7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7590,12 +7575,11 @@ define void @v_shuffle_v2f32_v8f32__7_8(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_8: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8320,11 +8304,11 @@ define void @v_shuffle_v2f32_v8f32__7_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9084,11 +9068,11 @@ define void @v_shuffle_v2f32_v8f32__7_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[9:10], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9852,11 +9836,11 @@ define void @v_shuffle_v2f32_v8f32__7_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10616,11 +10600,11 @@ define void @v_shuffle_v2f32_v8f32__7_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v11, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[11:12], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11384,11 +11368,11 @@ define void @v_shuffle_v2f32_v8f32__7_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12148,11 +12132,11 @@ define void @v_shuffle_v2f32_v8f32__7_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[13:14], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12918,11 +12902,11 @@ define void @v_shuffle_v2f32_v8f32__7_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll index 37df1b6a72e03..39c6a447788e4 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll @@ -138,12 +138,11 @@ define void @v_shuffle_v2i32_v4i32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -273,12 +272,11 @@ define void @v_shuffle_v2i32_v4i32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -316,16 +314,14 @@ define void @v_shuffle_v2i32_v4i32__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v5, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -370,15 +366,14 @@ define void @v_shuffle_v2i32_v4i32__7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v7, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -423,15 +418,14 @@ define void @v_shuffle_v2i32_v4i32__7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx2 v7, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -476,15 +470,14 @@ define void @v_shuffle_v2i32_v4i32__7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -529,13 +522,12 @@ define void @v_shuffle_v2i32_v4i32__7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -575,12 +567,12 @@ define void @v_shuffle_v2i32_v4i32__7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -618,12 +610,12 @@ define void @v_shuffle_v2i32_v4i32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -661,12 +653,12 @@ define void @v_shuffle_v2i32_v4i32__7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -704,12 +696,11 @@ define void @v_shuffle_v2i32_v4i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -872,13 +863,12 @@ define void @v_shuffle_v2i32_v4i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -917,12 +907,11 @@ define void @v_shuffle_v2i32_v4i32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1227,12 +1216,12 @@ define void @v_shuffle_v2i32_v4i32__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1582,12 +1571,12 @@ define void @v_shuffle_v2i32_v4i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1933,12 +1922,12 @@ define void @v_shuffle_v2i32_v4i32__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2250,12 +2239,11 @@ define void @v_shuffle_v2i32_v4i32__3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2592,11 +2580,11 @@ define void @v_shuffle_v2i32_v4i32__3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2972,11 +2960,11 @@ define void @v_shuffle_v2i32_v4i32__3_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3354,11 +3342,11 @@ define void @v_shuffle_v2i32_v4i32__3_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll index 94ee1774c2766..0917a6ecaa5c4 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll @@ -298,12 +298,11 @@ define void @v_shuffle_v2i32_v8i32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,12 +596,11 @@ define void @v_shuffle_v2i32_v8i32__15_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -640,16 +638,14 @@ define void @v_shuffle_v2i32_v8i32__15_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v9, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v8 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v9, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -694,15 +690,14 @@ define void @v_shuffle_v2i32_v8i32__15_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v15, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[7:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v15, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -747,15 +742,14 @@ define void @v_shuffle_v2i32_v8i32__15_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v11, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v10 -; GFX900-NEXT: global_store_dwordx2 v11, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -800,15 +794,14 @@ define void @v_shuffle_v2i32_v8i32__15_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v13, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[5:12] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v13, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -853,15 +846,14 @@ define void @v_shuffle_v2i32_v8i32__15_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v13, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v12 -; GFX900-NEXT: global_store_dwordx2 v13, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -906,15 +898,14 @@ define void @v_shuffle_v2i32_v8i32__15_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v11, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[3:10] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v11, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -959,15 +950,14 @@ define void @v_shuffle_v2i32_v8i32__15_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v15, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v14 -; GFX900-NEXT: global_store_dwordx2 v15, v[5:6], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1012,15 +1002,14 @@ define void @v_shuffle_v2i32_v8i32__15_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[1:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1065,13 +1054,12 @@ define void @v_shuffle_v2i32_v8i32__15_8(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_8: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1111,12 +1099,12 @@ define void @v_shuffle_v2i32_v8i32__15_9(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_9: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1154,12 +1142,12 @@ define void @v_shuffle_v2i32_v8i32__15_10(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_10: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1199,12 +1187,12 @@ define void @v_shuffle_v2i32_v8i32__15_11(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_11: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v3 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1242,12 +1230,12 @@ define void @v_shuffle_v2i32_v8i32__15_12(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_12: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1287,12 +1275,12 @@ define void @v_shuffle_v2i32_v8i32__15_13(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_13: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v5 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1330,12 +1318,12 @@ define void @v_shuffle_v2i32_v8i32__15_14(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_14: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1373,12 +1361,12 @@ define void @v_shuffle_v2i32_v8i32__15_15(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_15: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1416,12 +1404,11 @@ define void @v_shuffle_v2i32_v8i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[1:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1756,13 +1743,12 @@ define void @v_shuffle_v2i32_v8i32__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1801,12 +1787,11 @@ define void @v_shuffle_v2i32_v8i32__8_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[1:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2491,12 +2476,12 @@ define void @v_shuffle_v2i32_v8i32__7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3228,12 +3213,12 @@ define void @v_shuffle_v2i32_v8i32__7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3961,12 +3946,12 @@ define void @v_shuffle_v2i32_v8i32__7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v3 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4698,12 +4683,12 @@ define void @v_shuffle_v2i32_v8i32__7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5431,12 +5416,12 @@ define void @v_shuffle_v2i32_v8i32__7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v5 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6170,12 +6155,12 @@ define void @v_shuffle_v2i32_v8i32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6901,12 +6886,12 @@ define void @v_shuffle_v2i32_v8i32__7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7590,12 +7575,11 @@ define void @v_shuffle_v2i32_v8i32__7_8(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_8: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8320,11 +8304,11 @@ define void @v_shuffle_v2i32_v8i32__7_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9084,11 +9068,11 @@ define void @v_shuffle_v2i32_v8i32__7_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[9:10], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9852,11 +9836,11 @@ define void @v_shuffle_v2i32_v8i32__7_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10616,11 +10600,11 @@ define void @v_shuffle_v2i32_v8i32__7_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v11, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[11:12], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11384,11 +11368,11 @@ define void @v_shuffle_v2i32_v8i32__7_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12148,11 +12132,11 @@ define void @v_shuffle_v2i32_v8i32__7_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[13:14], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12918,11 +12902,11 @@ define void @v_shuffle_v2i32_v8i32__7_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll index 0b20caea9cd95..1df6f21f15594 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll @@ -58,39 +58,33 @@ define void @v_shuffle_v2i64_v2i64__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v2i64__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -114,39 +108,33 @@ define void @v_shuffle_v2i64_v2i64__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v2i64__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -160,55 +148,42 @@ define void @v_shuffle_v2i64_v2i64__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v2i64__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -222,49 +197,43 @@ define void @v_shuffle_v2i64_v2i64__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v2i64__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -291,31 +260,27 @@ define void @v_shuffle_v2i64_v2i64__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -329,39 +294,40 @@ define void @v_shuffle_v2i64_v2i64__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v2i64__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -375,39 +341,33 @@ define void @v_shuffle_v2i64_v2i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v2i64__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -478,31 +438,27 @@ define void @v_shuffle_v2i64_v2i64__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -515,39 +471,33 @@ define void @v_shuffle_v2i64_v2i64__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v2i64__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -638,39 +588,40 @@ define void @v_shuffle_v2i64_v2i64__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v2i64__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -772,39 +723,33 @@ define void @v_shuffle_v2i64_v2i64__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v2i64__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -928,12 +873,12 @@ define void @v_shuffle_v2i64_v2i64__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -944,12 +889,12 @@ define void @v_shuffle_v2i64_v2i64__1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -960,13 +905,12 @@ define void @v_shuffle_v2i64_v2i64__1_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll index bc8a56a30d8f9..13b16f778aa97 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll @@ -97,39 +97,33 @@ define void @v_shuffle_v2i64_v3i64__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -193,39 +187,33 @@ define void @v_shuffle_v2i64_v3i64__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -239,55 +227,42 @@ define void @v_shuffle_v2i64_v3i64__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -301,49 +276,43 @@ define void @v_shuffle_v2i64_v3i64__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -357,49 +326,43 @@ define void @v_shuffle_v2i64_v3i64__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__5_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -413,45 +376,40 @@ define void @v_shuffle_v2i64_v3i64__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -465,39 +423,40 @@ define void @v_shuffle_v2i64_v3i64__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -511,39 +470,40 @@ define void @v_shuffle_v2i64_v3i64__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -557,39 +517,33 @@ define void @v_shuffle_v2i64_v3i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -692,45 +646,40 @@ define void @v_shuffle_v2i64_v3i64__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -743,39 +692,33 @@ define void @v_shuffle_v2i64_v3i64__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -967,39 +910,40 @@ define void @v_shuffle_v2i64_v3i64__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1230,39 +1174,40 @@ define void @v_shuffle_v2i64_v3i64__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1459,39 +1404,33 @@ define void @v_shuffle_v2i64_v3i64__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v3i64__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v3i64__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v3i64__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1717,12 +1656,12 @@ define void @v_shuffle_v2i64_v3i64__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1733,12 +1672,12 @@ define void @v_shuffle_v2i64_v3i64__2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1749,13 +1688,12 @@ define void @v_shuffle_v2i64_v3i64__2_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2011,12 +1949,12 @@ define void @v_shuffle_v2i64_v3i64__2_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2027,12 +1965,12 @@ define void @v_shuffle_v2i64_v3i64__2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2043,13 +1981,12 @@ define void @v_shuffle_v2i64_v3i64__2_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2271,10 +2208,9 @@ define void @s_shuffle_v2i64_v3i64__2_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2308,9 @@ define void @s_shuffle_v2i64_v3i64__5_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND @@ -2392,15 +2327,13 @@ define void @s_shuffle_v2i64_v3i64__5_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -2410,15 +2343,13 @@ define void @s_shuffle_v2i64_v3i64__5_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -2481,11 +2412,11 @@ define void @s_shuffle_v2i64_v3i64__5_1() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND @@ -2536,13 +2467,11 @@ define void @s_shuffle_v2i64_v3i64__5_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND @@ -2555,46 +2484,18 @@ define void @s_shuffle_v2i64_v3i64__5_2() { } define void @s_shuffle_v2i64_v3i64__5_3() { -; GFX900-LABEL: s_shuffle_v2i64_v3i64__5_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2i64_v3i64__5_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2i64_v3i64__5_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2i64_v3i64__5_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> @@ -2607,10 +2508,10 @@ define void @s_shuffle_v2i64_v3i64__5_4() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:11] ; GFX9-NEXT: ;;#ASMEND @@ -2623,50 +2524,18 @@ define void @s_shuffle_v2i64_v3i64__5_4() { } define void @s_shuffle_v2i64_v3i64__5_5() { -; GFX900-LABEL: s_shuffle_v2i64_v3i64__5_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2i64_v3i64__5_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2i64_v3i64__5_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2i64_v3i64__5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> @@ -2790,46 +2659,18 @@ define void @s_shuffle_v2i64_v3i64__1_0() { } define void @s_shuffle_v2i64_v3i64__2_0() { -; GFX900-LABEL: s_shuffle_v2i64_v3i64__2_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2i64_v3i64__2_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2i64_v3i64__2_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2i64_v3i64__2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -3046,10 +2887,10 @@ define void @s_shuffle_v2i64_v3i64__2_1() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:11] ; GFX9-NEXT: ;;#ASMEND @@ -3271,50 +3112,18 @@ define void @s_shuffle_v2i64_v3i64__1_2() { } define void @s_shuffle_v2i64_v3i64__2_2() { -; GFX900-LABEL: s_shuffle_v2i64_v3i64__2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2i64_v3i64__2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2i64_v3i64__2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2i64_v3i64__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -3553,10 +3362,9 @@ define void @s_shuffle_v2i64_v3i64__2_3() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND @@ -3789,13 +3597,13 @@ define void @s_shuffle_v2i64_v3i64__2_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3805,13 +3613,13 @@ define void @s_shuffle_v2i64_v3i64__2_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3821,13 +3629,14 @@ define void @s_shuffle_v2i64_v3i64__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s11, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND @@ -4099,15 +3908,14 @@ define void @s_shuffle_v2i64_v3i64__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll index dd42a1dd44320..e756a7ae1682d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll @@ -136,39 +136,33 @@ define void @v_shuffle_v2i64_v4i64__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -272,39 +266,33 @@ define void @v_shuffle_v2i64_v4i64__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__7_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -318,55 +306,42 @@ define void @v_shuffle_v2i64_v4i64__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__7_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -380,49 +355,43 @@ define void @v_shuffle_v2i64_v4i64__7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__7_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -436,49 +405,43 @@ define void @v_shuffle_v2i64_v4i64__7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__7_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -492,49 +455,43 @@ define void @v_shuffle_v2i64_v4i64__7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__7_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -548,45 +505,40 @@ define void @v_shuffle_v2i64_v4i64__7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -600,39 +552,40 @@ define void @v_shuffle_v2i64_v4i64__7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -646,39 +599,40 @@ define void @v_shuffle_v2i64_v4i64__7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__7_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -692,39 +646,40 @@ define void @v_shuffle_v2i64_v4i64__7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -738,39 +693,33 @@ define void @v_shuffle_v2i64_v4i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -918,45 +867,40 @@ define void @v_shuffle_v2i64_v4i64__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -969,39 +913,33 @@ define void @v_shuffle_v2i64_v4i64__4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1294,39 +1232,40 @@ define void @v_shuffle_v2i64_v4i64__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1658,39 +1597,40 @@ define void @v_shuffle_v2i64_v4i64__3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2022,39 +1962,40 @@ define void @v_shuffle_v2i64_v4i64__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2346,39 +2287,33 @@ define void @v_shuffle_v2i64_v4i64__3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v4i64__3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2706,12 +2641,12 @@ define void @v_shuffle_v2i64_v4i64__3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2722,12 +2657,12 @@ define void @v_shuffle_v2i64_v4i64__3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2738,13 +2673,12 @@ define void @v_shuffle_v2i64_v4i64__3_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3102,12 +3036,12 @@ define void @v_shuffle_v2i64_v4i64__3_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v6 -; GFX900-NEXT: v_mov_b32_e32 v11, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3118,12 +3052,12 @@ define void @v_shuffle_v2i64_v4i64__3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v6 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3134,13 +3068,12 @@ define void @v_shuffle_v2i64_v4i64__3_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v6 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3498,12 +3431,12 @@ define void @v_shuffle_v2i64_v4i64__3_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v6 -; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3514,12 +3447,12 @@ define void @v_shuffle_v2i64_v4i64__3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, v6 -; GFX90A-NEXT: v_mov_b32_e32 v13, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3530,13 +3463,12 @@ define void @v_shuffle_v2i64_v4i64__3_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v6 -; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll index 7ee7c83e0122d..1c2215d39dc02 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll @@ -292,39 +292,33 @@ define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -588,39 +582,33 @@ define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -637,52 +625,39 @@ define void @v_shuffle_v2i64_v8i64__15_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, v14 -; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v18, v0 -; GFX900-NEXT: v_mov_b32_e32 v19, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ; def v[16:31] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v16 -; GFX90A-NEXT: v_mov_b32_e32 v3, v17 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v18, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v32, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ; def v[16:31] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v16 -; GFX942-NEXT: v_mov_b32_e32 v3, v17 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v18, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -697,48 +672,42 @@ define void @v_shuffle_v2i64_v8i64__15_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v18 -; GFX900-NEXT: v_mov_b32_e32 v1, v19 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ; def v[14:29] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v20, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v18 -; GFX90A-NEXT: v_mov_b32_e32 v1, v19 -; GFX90A-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v30, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v30, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ; def v[14:29] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v18 -; GFX942-NEXT: v_mov_b32_e32 v1, v19 -; GFX942-NEXT: global_store_dwordx4 v20, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v30, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -753,48 +722,42 @@ define void @v_shuffle_v2i64_v8i64__15_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v20 -; GFX900-NEXT: v_mov_b32_e32 v3, v21 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ; def v[12:27] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v22, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v20 -; GFX90A-NEXT: v_mov_b32_e32 v3, v21 -; GFX90A-NEXT: global_store_dwordx4 v22, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v28, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v28, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ; def v[12:27] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v20 -; GFX942-NEXT: v_mov_b32_e32 v3, v21 -; GFX942-NEXT: global_store_dwordx4 v22, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v28, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -809,48 +772,42 @@ define void @v_shuffle_v2i64_v8i64__15_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v22 -; GFX900-NEXT: v_mov_b32_e32 v5, v23 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ; def v[10:25] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v24, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v22 -; GFX90A-NEXT: v_mov_b32_e32 v5, v23 -; GFX90A-NEXT: global_store_dwordx4 v24, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v26, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v26, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ; def v[10:25] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v22 -; GFX942-NEXT: v_mov_b32_e32 v5, v23 -; GFX942-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v26, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -865,48 +822,42 @@ define void @v_shuffle_v2i64_v8i64__15_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v24 -; GFX900-NEXT: v_mov_b32_e32 v7, v25 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ; def v[8:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v26, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v24 -; GFX90A-NEXT: v_mov_b32_e32 v7, v25 -; GFX90A-NEXT: global_store_dwordx4 v26, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v24, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v24, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ; def v[8:23] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v24 -; GFX942-NEXT: v_mov_b32_e32 v7, v25 -; GFX942-NEXT: global_store_dwordx4 v26, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v24, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -921,48 +872,42 @@ define void @v_shuffle_v2i64_v8i64__15_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v26 -; GFX900-NEXT: v_mov_b32_e32 v9, v27 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ; def v[6:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v28, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v26 -; GFX90A-NEXT: v_mov_b32_e32 v9, v27 -; GFX90A-NEXT: global_store_dwordx4 v28, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v22, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v22, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ; def v[6:21] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v26 -; GFX942-NEXT: v_mov_b32_e32 v9, v27 -; GFX942-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v22, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -977,48 +922,42 @@ define void @v_shuffle_v2i64_v8i64__15_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v28 -; GFX900-NEXT: v_mov_b32_e32 v11, v29 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ; def v[4:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v30, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v28 -; GFX90A-NEXT: v_mov_b32_e32 v11, v29 -; GFX90A-NEXT: global_store_dwordx4 v30, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v20, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v20, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ; def v[4:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v28 -; GFX942-NEXT: v_mov_b32_e32 v11, v29 -; GFX942-NEXT: global_store_dwordx4 v30, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v20, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1033,48 +972,42 @@ define void @v_shuffle_v2i64_v8i64__15_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v30 -; GFX900-NEXT: v_mov_b32_e32 v13, v31 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ; def v[2:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v32, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, v30 -; GFX90A-NEXT: v_mov_b32_e32 v13, v31 -; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ; def v[2:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v30 -; GFX942-NEXT: v_mov_b32_e32 v13, v31 -; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1091,42 +1024,37 @@ define void @v_shuffle_v2i64_v8i64__15_8(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v16, v0 +; GFX900-NEXT: v_mov_b32_e32 v17, v1 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_8: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v14 -; GFX90A-NEXT: v_mov_b32_e32 v3, v15 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v1 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_8: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v14 -; GFX942-NEXT: v_mov_b32_e32 v3, v15 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v1 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1143,36 +1071,37 @@ define void @v_shuffle_v2i64_v8i64__15_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v2 +; GFX900-NEXT: v_mov_b32_e32 v17, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_9: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v17, v3 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_9: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v2 +; GFX942-NEXT: v_mov_b32_e32 v17, v3 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1189,36 +1118,37 @@ define void @v_shuffle_v2i64_v8i64__15_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v4 +; GFX900-NEXT: v_mov_b32_e32 v17, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_10: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v14 -; GFX90A-NEXT: v_mov_b32_e32 v3, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v4 +; GFX90A-NEXT: v_mov_b32_e32 v17, v5 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_10: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v14 -; GFX942-NEXT: v_mov_b32_e32 v3, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v4 +; GFX942-NEXT: v_mov_b32_e32 v17, v5 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1235,36 +1165,37 @@ define void @v_shuffle_v2i64_v8i64__15_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v6 +; GFX900-NEXT: v_mov_b32_e32 v17, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_11: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v6 +; GFX90A-NEXT: v_mov_b32_e32 v17, v7 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_11: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v6 +; GFX942-NEXT: v_mov_b32_e32 v17, v7 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1281,36 +1212,37 @@ define void @v_shuffle_v2i64_v8i64__15_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v14 -; GFX900-NEXT: v_mov_b32_e32 v7, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v8 +; GFX900-NEXT: v_mov_b32_e32 v17, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_12: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v14 -; GFX90A-NEXT: v_mov_b32_e32 v7, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v8 +; GFX90A-NEXT: v_mov_b32_e32 v17, v9 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_12: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v14 -; GFX942-NEXT: v_mov_b32_e32 v7, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v8 +; GFX942-NEXT: v_mov_b32_e32 v17, v9 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1327,36 +1259,37 @@ define void @v_shuffle_v2i64_v8i64__15_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v14 -; GFX900-NEXT: v_mov_b32_e32 v9, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v10 +; GFX900-NEXT: v_mov_b32_e32 v17, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_13: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v14 -; GFX90A-NEXT: v_mov_b32_e32 v9, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v10 +; GFX90A-NEXT: v_mov_b32_e32 v17, v11 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_13: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v14 -; GFX942-NEXT: v_mov_b32_e32 v9, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v10 +; GFX942-NEXT: v_mov_b32_e32 v17, v11 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1373,36 +1306,37 @@ define void @v_shuffle_v2i64_v8i64__15_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v14 -; GFX900-NEXT: v_mov_b32_e32 v11, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v12 +; GFX900-NEXT: v_mov_b32_e32 v17, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_14: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v14 -; GFX90A-NEXT: v_mov_b32_e32 v11, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v12 +; GFX90A-NEXT: v_mov_b32_e32 v17, v13 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_14: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v14 -; GFX942-NEXT: v_mov_b32_e32 v11, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v12 +; GFX942-NEXT: v_mov_b32_e32 v17, v13 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1419,36 +1353,37 @@ define void @v_shuffle_v2i64_v8i64__15_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_15: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_15: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1462,12 +1397,10 @@ define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1475,26 +1408,22 @@ define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ; def v[2:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ; def v[2:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1825,42 +1754,37 @@ define void @v_shuffle_v2i64_v8i64__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v16, v0 +; GFX900-NEXT: v_mov_b32_e32 v17, v1 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v14 -; GFX90A-NEXT: v_mov_b32_e32 v3, v15 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v1 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v14 -; GFX942-NEXT: v_mov_b32_e32 v3, v15 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v1 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1873,12 +1797,10 @@ define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1886,26 +1808,22 @@ define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ; def v[2:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ; def v[2:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -2605,36 +2523,37 @@ define void @v_shuffle_v2i64_v8i64__7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v2 +; GFX900-NEXT: v_mov_b32_e32 v17, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v17, v3 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v2 +; GFX942-NEXT: v_mov_b32_e32 v17, v3 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -3373,36 +3292,37 @@ define void @v_shuffle_v2i64_v8i64__7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v4 +; GFX900-NEXT: v_mov_b32_e32 v17, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v14 -; GFX90A-NEXT: v_mov_b32_e32 v3, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v4 +; GFX90A-NEXT: v_mov_b32_e32 v17, v5 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v14 -; GFX942-NEXT: v_mov_b32_e32 v3, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v4 +; GFX942-NEXT: v_mov_b32_e32 v17, v5 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -4141,36 +4061,37 @@ define void @v_shuffle_v2i64_v8i64__7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v6 +; GFX900-NEXT: v_mov_b32_e32 v17, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v6 +; GFX90A-NEXT: v_mov_b32_e32 v17, v7 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v6 +; GFX942-NEXT: v_mov_b32_e32 v17, v7 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -4909,36 +4830,37 @@ define void @v_shuffle_v2i64_v8i64__7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v14 -; GFX900-NEXT: v_mov_b32_e32 v7, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v8 +; GFX900-NEXT: v_mov_b32_e32 v17, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v14 -; GFX90A-NEXT: v_mov_b32_e32 v7, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v8 +; GFX90A-NEXT: v_mov_b32_e32 v17, v9 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v14 -; GFX942-NEXT: v_mov_b32_e32 v7, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v8 +; GFX942-NEXT: v_mov_b32_e32 v17, v9 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -5677,36 +5599,37 @@ define void @v_shuffle_v2i64_v8i64__7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v14 -; GFX900-NEXT: v_mov_b32_e32 v9, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v10 +; GFX900-NEXT: v_mov_b32_e32 v17, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v14 -; GFX90A-NEXT: v_mov_b32_e32 v9, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v10 +; GFX90A-NEXT: v_mov_b32_e32 v17, v11 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v14 -; GFX942-NEXT: v_mov_b32_e32 v9, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v10 +; GFX942-NEXT: v_mov_b32_e32 v17, v11 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -6445,36 +6368,37 @@ define void @v_shuffle_v2i64_v8i64__7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v14 -; GFX900-NEXT: v_mov_b32_e32 v11, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v12 +; GFX900-NEXT: v_mov_b32_e32 v17, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v14 -; GFX90A-NEXT: v_mov_b32_e32 v11, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v12 +; GFX90A-NEXT: v_mov_b32_e32 v17, v13 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v14 -; GFX942-NEXT: v_mov_b32_e32 v11, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v12 +; GFX942-NEXT: v_mov_b32_e32 v17, v13 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -7213,36 +7137,37 @@ define void @v_shuffle_v2i64_v8i64__7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -7914,39 +7839,33 @@ define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_8: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_8: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_8: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -8679,15 +8598,15 @@ define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v2 +; GFX900-NEXT: v_mov_b32_e32 v17, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v16 -; GFX900-NEXT: v_mov_b32_e32 v1, v17 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8695,15 +8614,15 @@ define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ; def v[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v18 +; GFX90A-NEXT: v_mov_b32_e32 v17, v19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v16 -; GFX90A-NEXT: v_mov_b32_e32 v1, v17 -; GFX90A-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v32, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8711,16 +8630,15 @@ define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ; def v[16:31] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v18 +; GFX942-NEXT: v_mov_b32_e32 v17, v19 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v16 -; GFX942-NEXT: v_mov_b32_e32 v1, v17 -; GFX942-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -9483,15 +9401,15 @@ define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v4 +; GFX900-NEXT: v_mov_b32_e32 v17, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v16 -; GFX900-NEXT: v_mov_b32_e32 v3, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9499,15 +9417,15 @@ define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ; def v[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v20 +; GFX90A-NEXT: v_mov_b32_e32 v17, v21 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v16 -; GFX90A-NEXT: v_mov_b32_e32 v3, v17 -; GFX90A-NEXT: global_store_dwordx4 v18, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v32, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9515,16 +9433,15 @@ define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ; def v[16:31] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v20 +; GFX942-NEXT: v_mov_b32_e32 v17, v21 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v16 -; GFX942-NEXT: v_mov_b32_e32 v3, v17 -; GFX942-NEXT: global_store_dwordx4 v18, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -10287,15 +10204,15 @@ define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v6 +; GFX900-NEXT: v_mov_b32_e32 v17, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10303,15 +10220,15 @@ define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ; def v[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v22 +; GFX90A-NEXT: v_mov_b32_e32 v17, v23 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v16 -; GFX90A-NEXT: v_mov_b32_e32 v5, v17 -; GFX90A-NEXT: global_store_dwordx4 v18, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v32, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10319,16 +10236,15 @@ define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ; def v[16:31] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v22 +; GFX942-NEXT: v_mov_b32_e32 v17, v23 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v16 -; GFX942-NEXT: v_mov_b32_e32 v5, v17 -; GFX942-NEXT: global_store_dwordx4 v18, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -11091,15 +11007,15 @@ define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v8 +; GFX900-NEXT: v_mov_b32_e32 v17, v9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11107,15 +11023,15 @@ define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ; def v[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v24 +; GFX90A-NEXT: v_mov_b32_e32 v17, v25 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v16 -; GFX90A-NEXT: v_mov_b32_e32 v7, v17 -; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v32, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11123,16 +11039,15 @@ define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ; def v[16:31] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v24 +; GFX942-NEXT: v_mov_b32_e32 v17, v25 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v16 -; GFX942-NEXT: v_mov_b32_e32 v7, v17 -; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -11895,15 +11810,15 @@ define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v10 +; GFX900-NEXT: v_mov_b32_e32 v17, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11911,15 +11826,15 @@ define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ; def v[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v26 +; GFX90A-NEXT: v_mov_b32_e32 v17, v27 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v16 -; GFX90A-NEXT: v_mov_b32_e32 v9, v17 -; GFX90A-NEXT: global_store_dwordx4 v18, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v32, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11927,16 +11842,15 @@ define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ; def v[16:31] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v26 +; GFX942-NEXT: v_mov_b32_e32 v17, v27 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v16 -; GFX942-NEXT: v_mov_b32_e32 v9, v17 -; GFX942-NEXT: global_store_dwordx4 v18, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -12699,15 +12613,15 @@ define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v12 +; GFX900-NEXT: v_mov_b32_e32 v17, v13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12715,15 +12629,15 @@ define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ; def v[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v28 +; GFX90A-NEXT: v_mov_b32_e32 v17, v29 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v16 -; GFX90A-NEXT: v_mov_b32_e32 v11, v17 -; GFX90A-NEXT: global_store_dwordx4 v18, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v32, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12731,16 +12645,15 @@ define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ; def v[16:31] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v28 +; GFX942-NEXT: v_mov_b32_e32 v17, v29 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v16 -; GFX942-NEXT: v_mov_b32_e32 v11, v17 -; GFX942-NEXT: global_store_dwordx4 v18, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -13503,15 +13416,15 @@ define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13519,15 +13432,15 @@ define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ; def v[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v30 +; GFX90A-NEXT: v_mov_b32_e32 v17, v31 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v16 -; GFX90A-NEXT: v_mov_b32_e32 v13, v17 -; GFX90A-NEXT: global_store_dwordx4 v18, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v32, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -13535,16 +13448,15 @@ define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ; def v[16:31] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v30 +; GFX942-NEXT: v_mov_b32_e32 v17, v31 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v16 -; GFX942-NEXT: v_mov_b32_e32 v13, v17 -; GFX942-NEXT: global_store_dwordx4 v18, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v32, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll index 2ecbf9622a259..411d8b735b9b6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll @@ -58,39 +58,33 @@ define void @v_shuffle_v2p0_v2p0__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v2p0__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -114,39 +108,33 @@ define void @v_shuffle_v2p0_v2p0__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v2p0__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -160,55 +148,42 @@ define void @v_shuffle_v2p0_v2p0__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v2p0__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -222,49 +197,43 @@ define void @v_shuffle_v2p0_v2p0__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v2p0__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -291,31 +260,27 @@ define void @v_shuffle_v2p0_v2p0__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -329,39 +294,40 @@ define void @v_shuffle_v2p0_v2p0__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v2p0__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -375,39 +341,33 @@ define void @v_shuffle_v2p0_v2p0__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v2p0__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -478,31 +438,27 @@ define void @v_shuffle_v2p0_v2p0__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -515,39 +471,33 @@ define void @v_shuffle_v2p0_v2p0__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v2p0__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -638,39 +588,40 @@ define void @v_shuffle_v2p0_v2p0__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v2p0__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -772,39 +723,33 @@ define void @v_shuffle_v2p0_v2p0__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v2p0__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -928,12 +873,12 @@ define void @v_shuffle_v2p0_v2p0__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -944,12 +889,12 @@ define void @v_shuffle_v2p0_v2p0__1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -960,13 +905,12 @@ define void @v_shuffle_v2p0_v2p0__1_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll index 27a6cf11c4cb1..385dc73531d14 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll @@ -97,39 +97,33 @@ define void @v_shuffle_v2p0_v3p0__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -193,39 +187,33 @@ define void @v_shuffle_v2p0_v3p0__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -239,55 +227,42 @@ define void @v_shuffle_v2p0_v3p0__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -301,49 +276,43 @@ define void @v_shuffle_v2p0_v3p0__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -357,49 +326,43 @@ define void @v_shuffle_v2p0_v3p0__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__5_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -413,45 +376,40 @@ define void @v_shuffle_v2p0_v3p0__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -465,39 +423,40 @@ define void @v_shuffle_v2p0_v3p0__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -511,39 +470,40 @@ define void @v_shuffle_v2p0_v3p0__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -557,39 +517,33 @@ define void @v_shuffle_v2p0_v3p0__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -692,45 +646,40 @@ define void @v_shuffle_v2p0_v3p0__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -743,39 +692,33 @@ define void @v_shuffle_v2p0_v3p0__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -967,39 +910,40 @@ define void @v_shuffle_v2p0_v3p0__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1230,39 +1174,40 @@ define void @v_shuffle_v2p0_v3p0__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1459,39 +1404,33 @@ define void @v_shuffle_v2p0_v3p0__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v3p0__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v3p0__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v3p0__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1717,12 +1656,12 @@ define void @v_shuffle_v2p0_v3p0__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1733,12 +1672,12 @@ define void @v_shuffle_v2p0_v3p0__2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1749,13 +1688,12 @@ define void @v_shuffle_v2p0_v3p0__2_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2011,12 +1949,12 @@ define void @v_shuffle_v2p0_v3p0__2_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2027,12 +1965,12 @@ define void @v_shuffle_v2p0_v3p0__2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2043,13 +1981,12 @@ define void @v_shuffle_v2p0_v3p0__2_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2271,10 +2208,9 @@ define void @s_shuffle_v2p0_v3p0__2_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2308,9 @@ define void @s_shuffle_v2p0_v3p0__5_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND @@ -2392,15 +2327,13 @@ define void @s_shuffle_v2p0_v3p0__5_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -2410,15 +2343,13 @@ define void @s_shuffle_v2p0_v3p0__5_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -2481,11 +2412,11 @@ define void @s_shuffle_v2p0_v3p0__5_1() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND @@ -2536,13 +2467,11 @@ define void @s_shuffle_v2p0_v3p0__5_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND @@ -2555,46 +2484,18 @@ define void @s_shuffle_v2p0_v3p0__5_2() { } define void @s_shuffle_v2p0_v3p0__5_3() { -; GFX900-LABEL: s_shuffle_v2p0_v3p0__5_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2p0_v3p0__5_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2p0_v3p0__5_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2p0_v3p0__5_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> @@ -2607,10 +2508,10 @@ define void @s_shuffle_v2p0_v3p0__5_4() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:11] ; GFX9-NEXT: ;;#ASMEND @@ -2623,50 +2524,18 @@ define void @s_shuffle_v2p0_v3p0__5_4() { } define void @s_shuffle_v2p0_v3p0__5_5() { -; GFX900-LABEL: s_shuffle_v2p0_v3p0__5_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2p0_v3p0__5_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2p0_v3p0__5_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2p0_v3p0__5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> @@ -2790,46 +2659,18 @@ define void @s_shuffle_v2p0_v3p0__1_0() { } define void @s_shuffle_v2p0_v3p0__2_0() { -; GFX900-LABEL: s_shuffle_v2p0_v3p0__2_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2p0_v3p0__2_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2p0_v3p0__2_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2p0_v3p0__2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf) @@ -3046,10 +2887,10 @@ define void @s_shuffle_v2p0_v3p0__2_1() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:11] ; GFX9-NEXT: ;;#ASMEND @@ -3271,50 +3112,18 @@ define void @s_shuffle_v2p0_v3p0__1_2() { } define void @s_shuffle_v2p0_v3p0__2_2() { -; GFX900-LABEL: s_shuffle_v2p0_v3p0__2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2p0_v3p0__2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2p0_v3p0__2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2p0_v3p0__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf) @@ -3553,10 +3362,9 @@ define void @s_shuffle_v2p0_v3p0__2_3() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND @@ -3789,13 +3597,13 @@ define void @s_shuffle_v2p0_v3p0__2_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3805,13 +3613,13 @@ define void @s_shuffle_v2p0_v3p0__2_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3821,13 +3629,14 @@ define void @s_shuffle_v2p0_v3p0__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s11, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND @@ -4099,15 +3908,14 @@ define void @s_shuffle_v2p0_v3p0__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll index ae31524ebaa7f..70d72571b9897 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll @@ -136,39 +136,33 @@ define void @v_shuffle_v2p0_v4p0__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -272,39 +266,33 @@ define void @v_shuffle_v2p0_v4p0__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__7_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -318,55 +306,42 @@ define void @v_shuffle_v2p0_v4p0__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__7_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -380,49 +355,43 @@ define void @v_shuffle_v2p0_v4p0__7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__7_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -436,49 +405,43 @@ define void @v_shuffle_v2p0_v4p0__7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__7_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -492,49 +455,43 @@ define void @v_shuffle_v2p0_v4p0__7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__7_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -548,45 +505,40 @@ define void @v_shuffle_v2p0_v4p0__7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -600,39 +552,40 @@ define void @v_shuffle_v2p0_v4p0__7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -646,39 +599,40 @@ define void @v_shuffle_v2p0_v4p0__7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__7_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -692,39 +646,40 @@ define void @v_shuffle_v2p0_v4p0__7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -738,39 +693,33 @@ define void @v_shuffle_v2p0_v4p0__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -918,45 +867,40 @@ define void @v_shuffle_v2p0_v4p0__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -969,39 +913,33 @@ define void @v_shuffle_v2p0_v4p0__4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1294,39 +1232,40 @@ define void @v_shuffle_v2p0_v4p0__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1658,39 +1597,40 @@ define void @v_shuffle_v2p0_v4p0__3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2022,39 +1962,40 @@ define void @v_shuffle_v2p0_v4p0__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2346,39 +2287,33 @@ define void @v_shuffle_v2p0_v4p0__3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v4p0__3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2706,12 +2641,12 @@ define void @v_shuffle_v2p0_v4p0__3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2722,12 +2657,12 @@ define void @v_shuffle_v2p0_v4p0__3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2738,13 +2673,12 @@ define void @v_shuffle_v2p0_v4p0__3_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3102,12 +3036,12 @@ define void @v_shuffle_v2p0_v4p0__3_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v6 -; GFX900-NEXT: v_mov_b32_e32 v11, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3118,12 +3052,12 @@ define void @v_shuffle_v2p0_v4p0__3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v6 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3134,13 +3068,12 @@ define void @v_shuffle_v2p0_v4p0__3_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v6 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3498,12 +3431,12 @@ define void @v_shuffle_v2p0_v4p0__3_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v6 -; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3514,12 +3447,12 @@ define void @v_shuffle_v2p0_v4p0__3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, v6 -; GFX90A-NEXT: v_mov_b32_e32 v13, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3530,13 +3463,12 @@ define void @v_shuffle_v2p0_v4p0__3_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v6 -; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll index a9085502c7358..c35361721e9b0 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll @@ -138,12 +138,11 @@ define void @v_shuffle_v2p3_v4p3__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -273,12 +272,11 @@ define void @v_shuffle_v2p3_v4p3__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -316,16 +314,14 @@ define void @v_shuffle_v2p3_v4p3__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v5, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -370,15 +366,14 @@ define void @v_shuffle_v2p3_v4p3__7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v7, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -423,15 +418,14 @@ define void @v_shuffle_v2p3_v4p3__7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx2 v7, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -476,15 +470,14 @@ define void @v_shuffle_v2p3_v4p3__7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -529,13 +522,12 @@ define void @v_shuffle_v2p3_v4p3__7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -575,12 +567,12 @@ define void @v_shuffle_v2p3_v4p3__7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -618,12 +610,12 @@ define void @v_shuffle_v2p3_v4p3__7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -661,12 +653,12 @@ define void @v_shuffle_v2p3_v4p3__7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -704,12 +696,11 @@ define void @v_shuffle_v2p3_v4p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -872,13 +863,12 @@ define void @v_shuffle_v2p3_v4p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -917,12 +907,11 @@ define void @v_shuffle_v2p3_v4p3__4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1227,12 +1216,12 @@ define void @v_shuffle_v2p3_v4p3__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1582,12 +1571,12 @@ define void @v_shuffle_v2p3_v4p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1933,12 +1922,12 @@ define void @v_shuffle_v2p3_v4p3__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2250,12 +2239,11 @@ define void @v_shuffle_v2p3_v4p3__3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2592,11 +2580,11 @@ define void @v_shuffle_v2p3_v4p3__3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2972,11 +2960,11 @@ define void @v_shuffle_v2p3_v4p3__3_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3354,11 +3342,11 @@ define void @v_shuffle_v2p3_v4p3__3_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll index 9174e92cd9c82..86ec2bc9816bc 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll @@ -298,12 +298,11 @@ define void @v_shuffle_v2p3_v8p3__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,12 +596,11 @@ define void @v_shuffle_v2p3_v8p3__15_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -640,16 +638,14 @@ define void @v_shuffle_v2p3_v8p3__15_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v9, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v8 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v9, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -694,15 +690,14 @@ define void @v_shuffle_v2p3_v8p3__15_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v15, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[7:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v15, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -747,15 +742,14 @@ define void @v_shuffle_v2p3_v8p3__15_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v11, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v10 -; GFX900-NEXT: global_store_dwordx2 v11, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -800,15 +794,14 @@ define void @v_shuffle_v2p3_v8p3__15_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v13, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[5:12] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v13, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -853,15 +846,14 @@ define void @v_shuffle_v2p3_v8p3__15_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v13, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v12 -; GFX900-NEXT: global_store_dwordx2 v13, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -906,15 +898,14 @@ define void @v_shuffle_v2p3_v8p3__15_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v11, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[3:10] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v11, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -959,15 +950,14 @@ define void @v_shuffle_v2p3_v8p3__15_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v15, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v14 -; GFX900-NEXT: global_store_dwordx2 v15, v[5:6], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1012,15 +1002,14 @@ define void @v_shuffle_v2p3_v8p3__15_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[1:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1065,13 +1054,12 @@ define void @v_shuffle_v2p3_v8p3__15_8(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_8: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1111,12 +1099,12 @@ define void @v_shuffle_v2p3_v8p3__15_9(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_9: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1154,12 +1142,12 @@ define void @v_shuffle_v2p3_v8p3__15_10(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_10: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1199,12 +1187,12 @@ define void @v_shuffle_v2p3_v8p3__15_11(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_11: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v3 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1242,12 +1230,12 @@ define void @v_shuffle_v2p3_v8p3__15_12(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_12: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1287,12 +1275,12 @@ define void @v_shuffle_v2p3_v8p3__15_13(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_13: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v5 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1330,12 +1318,12 @@ define void @v_shuffle_v2p3_v8p3__15_14(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_14: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1373,12 +1361,12 @@ define void @v_shuffle_v2p3_v8p3__15_15(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_15: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1416,12 +1404,11 @@ define void @v_shuffle_v2p3_v8p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[1:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1756,13 +1743,12 @@ define void @v_shuffle_v2p3_v8p3__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1801,12 +1787,11 @@ define void @v_shuffle_v2p3_v8p3__8_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[1:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2491,12 +2476,12 @@ define void @v_shuffle_v2p3_v8p3__7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3228,12 +3213,12 @@ define void @v_shuffle_v2p3_v8p3__7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3961,12 +3946,12 @@ define void @v_shuffle_v2p3_v8p3__7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v3 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4698,12 +4683,12 @@ define void @v_shuffle_v2p3_v8p3__7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5431,12 +5416,12 @@ define void @v_shuffle_v2p3_v8p3__7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v5 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6170,12 +6155,12 @@ define void @v_shuffle_v2p3_v8p3__7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6901,12 +6886,12 @@ define void @v_shuffle_v2p3_v8p3__7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7590,12 +7575,11 @@ define void @v_shuffle_v2p3_v8p3__7_8(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_8: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8320,11 +8304,11 @@ define void @v_shuffle_v2p3_v8p3__7_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9084,11 +9068,11 @@ define void @v_shuffle_v2p3_v8p3__7_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[9:10], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9852,11 +9836,11 @@ define void @v_shuffle_v2p3_v8p3__7_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10616,11 +10600,11 @@ define void @v_shuffle_v2p3_v8p3__7_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v11, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[11:12], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11384,11 +11368,11 @@ define void @v_shuffle_v2p3_v8p3__7_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12148,11 +12132,11 @@ define void @v_shuffle_v2p3_v8p3__7_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[13:14], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12918,11 +12902,11 @@ define void @v_shuffle_v2p3_v8p3__7_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v7 -; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[7:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll index 16202a708fd5c..d38b17c04947b 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll @@ -776,15 +776,14 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll index 131204c8a6430..4032d31cbb041 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v3f32_v3f32__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v3f32_v3f32__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -237,48 +231,45 @@ define void @v_shuffle_v3f32_v3f32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -292,46 +283,43 @@ define void @v_shuffle_v3f32_v3f32__5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -345,15 +333,14 @@ define void @v_shuffle_v3f32_v3f32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -361,15 +348,14 @@ define void @v_shuffle_v3f32_v3f32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -377,15 +363,14 @@ define void @v_shuffle_v3f32_v3f32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -401,37 +386,35 @@ define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -445,36 +428,37 @@ define void @v_shuffle_v3f32_v3f32__5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v3f32_v3f32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -533,50 +516,46 @@ define void @v_shuffle_v3f32_v3f32__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -591,14 +570,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -606,16 +585,15 @@ define void @v_shuffle_v3f32_v3f32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -623,17 +601,15 @@ define void @v_shuffle_v3f32_v3f32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -648,14 +624,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -663,15 +639,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -679,16 +654,15 @@ define void @v_shuffle_v3f32_v3f32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -702,42 +676,40 @@ define void @v_shuffle_v3f32_v3f32__5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -751,42 +723,40 @@ define void @v_shuffle_v3f32_v3f32__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -800,39 +770,40 @@ define void @v_shuffle_v3f32_v3f32__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -858,26 +829,25 @@ define void @v_shuffle_v3f32_v3f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -948,29 +918,27 @@ define void @v_shuffle_v3f32_v3f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -983,42 +951,40 @@ define void @v_shuffle_v3f32_v3f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1043,26 +1009,25 @@ define void @v_shuffle_v3f32_v3f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1091,16 +1056,15 @@ define void @v_shuffle_v3f32_v3f32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1108,17 +1072,15 @@ define void @v_shuffle_v3f32_v3f32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1133,50 +1095,45 @@ define void @v_shuffle_v3f32_v3f32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1190,49 +1147,42 @@ define void @v_shuffle_v3f32_v3f32__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1246,52 +1196,46 @@ define void @v_shuffle_v3f32_v3f32__5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1305,50 +1249,46 @@ define void @v_shuffle_v3f32_v3f32__5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1362,50 +1302,46 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1419,49 +1355,46 @@ define void @v_shuffle_v3f32_v3f32__5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1604,39 +1537,40 @@ define void @v_shuffle_v3f32_v3f32__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__2_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1747,15 +1681,14 @@ define void @v_shuffle_v3f32_v3f32__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1763,15 +1696,14 @@ define void @v_shuffle_v3f32_v3f32__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1779,16 +1711,14 @@ define void @v_shuffle_v3f32_v3f32__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1802,15 +1732,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1818,15 +1747,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,16 +1762,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1857,15 +1783,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1873,16 +1798,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,17 +1814,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1915,15 +1837,14 @@ define void @v_shuffle_v3f32_v3f32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1933,14 +1854,13 @@ define void @v_shuffle_v3f32_v3f32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1950,14 +1870,13 @@ define void @v_shuffle_v3f32_v3f32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1974,13 +1893,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1988,16 +1906,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2005,16 +1922,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2029,15 +1945,14 @@ define void @v_shuffle_v3f32_v3f32__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2045,15 +1960,15 @@ define void @v_shuffle_v3f32_v3f32__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2061,16 +1976,15 @@ define void @v_shuffle_v3f32_v3f32__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2212,39 +2126,40 @@ define void @v_shuffle_v3f32_v3f32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2353,16 +2268,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2370,15 +2284,14 @@ define void @v_shuffle_v3f32_v3f32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2386,15 +2299,14 @@ define void @v_shuffle_v3f32_v3f32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2408,46 +2320,43 @@ define void @v_shuffle_v3f32_v3f32__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2462,15 +2371,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,14 +2388,13 @@ define void @v_shuffle_v3f32_v3f32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2497,14 +2404,13 @@ define void @v_shuffle_v3f32_v3f32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2518,46 +2424,43 @@ define void @v_shuffle_v3f32_v3f32__5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2574,13 +2477,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2588,15 +2490,14 @@ define void @v_shuffle_v3f32_v3f32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2604,16 +2505,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2628,15 +2528,14 @@ define void @v_shuffle_v3f32_v3f32__5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2644,15 +2543,14 @@ define void @v_shuffle_v3f32_v3f32__5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2660,15 +2558,15 @@ define void @v_shuffle_v3f32_v3f32__5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2773,36 +2671,33 @@ define void @v_shuffle_v3f32_v3f32__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2839,29 +2734,27 @@ define void @v_shuffle_v3f32_v3f32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2875,42 +2768,40 @@ define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2924,13 +2815,12 @@ define void @v_shuffle_v3f32_v3f32__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2970,14 +2860,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2985,16 +2875,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3002,16 +2891,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3026,14 +2914,14 @@ define void @v_shuffle_v3f32_v3f32__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3041,14 +2929,14 @@ define void @v_shuffle_v3f32_v3f32__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3056,15 +2944,15 @@ define void @v_shuffle_v3f32_v3f32__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3079,14 +2967,14 @@ define void @v_shuffle_v3f32_v3f32__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3094,16 +2982,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3111,16 +2998,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3147,29 +3033,27 @@ define void @v_shuffle_v3f32_v3f32__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3340,12 +3224,11 @@ define void @v_shuffle_v3f32_v3f32__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3355,13 +3238,13 @@ define void @v_shuffle_v3f32_v3f32__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3371,13 +3254,13 @@ define void @v_shuffle_v3f32_v3f32__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3480,39 +3363,40 @@ define void @v_shuffle_v3f32_v3f32__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3526,38 +3410,37 @@ define void @v_shuffle_v3f32_v3f32__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3572,15 +3455,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3588,16 +3470,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3605,16 +3486,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3629,15 +3509,14 @@ define void @v_shuffle_v3f32_v3f32__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3645,15 +3524,14 @@ define void @v_shuffle_v3f32_v3f32__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3661,16 +3539,15 @@ define void @v_shuffle_v3f32_v3f32__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3685,15 +3562,14 @@ define void @v_shuffle_v3f32_v3f32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3701,16 +3577,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3718,16 +3593,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3741,41 +3615,40 @@ define void @v_shuffle_v3f32_v3f32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3945,15 +3818,15 @@ define void @v_shuffle_v3f32_v3f32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,13 +3836,13 @@ define void @v_shuffle_v3f32_v3f32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3979,13 +3852,13 @@ define void @v_shuffle_v3f32_v3f32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4087,36 +3960,37 @@ define void @v_shuffle_v3f32_v3f32__5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4131,15 +4005,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4147,15 +4020,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4163,16 +4036,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4187,15 +4059,14 @@ define void @v_shuffle_v3f32_v3f32__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4203,15 +4074,14 @@ define void @v_shuffle_v3f32_v3f32__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4219,16 +4089,15 @@ define void @v_shuffle_v3f32_v3f32__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4243,15 +4112,14 @@ define void @v_shuffle_v3f32_v3f32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4259,15 +4127,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4275,15 +4143,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4297,42 +4165,40 @@ define void @v_shuffle_v3f32_v3f32__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4346,36 +4212,40 @@ define void @v_shuffle_v3f32_v3f32__5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll index c5a08f098b4c6..2e4131d378906 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v3f32_v4f32__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -141,12 +138,11 @@ define void @v_shuffle_v3f32_v4f32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -236,36 +232,33 @@ define void @v_shuffle_v3f32_v4f32__6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__6_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__6_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -279,12 +272,11 @@ define void @v_shuffle_v3f32_v4f32__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -322,16 +314,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +368,14 @@ define void @v_shuffle_v3f32_v4f32__7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -431,15 +420,14 @@ define void @v_shuffle_v3f32_v4f32__7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -485,16 +473,14 @@ define void @v_shuffle_v3f32_v4f32__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -508,9 +494,8 @@ define void @v_shuffle_v3f32_v4f32__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -524,9 +509,9 @@ define void @v_shuffle_v3f32_v4f32__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -540,13 +525,12 @@ define void @v_shuffle_v3f32_v4f32__7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -586,12 +570,12 @@ define void @v_shuffle_v3f32_v4f32__7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -629,12 +613,12 @@ define void @v_shuffle_v3f32_v4f32__7_6_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -674,13 +658,12 @@ define void @v_shuffle_v3f32_v4f32__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -691,9 +674,8 @@ define void @v_shuffle_v3f32_v4f32__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -704,9 +686,8 @@ define void @v_shuffle_v3f32_v4f32__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -720,17 +701,15 @@ define void @v_shuffle_v3f32_v4f32__7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -740,14 +719,12 @@ define void @v_shuffle_v3f32_v4f32__7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -757,15 +734,12 @@ define void @v_shuffle_v3f32_v4f32__7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -780,16 +754,14 @@ define void @v_shuffle_v3f32_v4f32__7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -797,16 +769,15 @@ define void @v_shuffle_v3f32_v4f32__7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -814,17 +785,15 @@ define void @v_shuffle_v3f32_v4f32__7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -839,15 +808,14 @@ define void @v_shuffle_v3f32_v4f32__7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -855,15 +823,14 @@ define void @v_shuffle_v3f32_v4f32__7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -871,16 +838,15 @@ define void @v_shuffle_v3f32_v4f32__7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -895,15 +861,14 @@ define void @v_shuffle_v3f32_v4f32__7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -911,16 +876,15 @@ define void @v_shuffle_v3f32_v4f32__7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -928,16 +892,15 @@ define void @v_shuffle_v3f32_v4f32__7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -951,43 +914,39 @@ define void @v_shuffle_v3f32_v4f32__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1001,42 +960,39 @@ define void @v_shuffle_v3f32_v4f32__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1050,13 +1006,13 @@ define void @v_shuffle_v3f32_v4f32__7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1096,13 +1052,13 @@ define void @v_shuffle_v3f32_v4f32__7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1112,11 +1068,10 @@ define void @v_shuffle_v3f32_v4f32__7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1126,11 +1081,10 @@ define void @v_shuffle_v3f32_v4f32__7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1145,38 +1099,36 @@ define void @v_shuffle_v3f32_v4f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1247,29 +1199,27 @@ define void @v_shuffle_v3f32_v4f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1285,40 +1235,36 @@ define void @v_shuffle_v3f32_v4f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1331,43 +1277,40 @@ define void @v_shuffle_v3f32_v4f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1381,38 +1324,36 @@ define void @v_shuffle_v3f32_v4f32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__4_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__4_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1442,16 +1383,15 @@ define void @v_shuffle_v3f32_v4f32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1459,17 +1399,15 @@ define void @v_shuffle_v3f32_v4f32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1486,14 +1424,12 @@ define void @v_shuffle_v3f32_v4f32__6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1503,14 +1439,12 @@ define void @v_shuffle_v3f32_v4f32__6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1520,15 +1454,13 @@ define void @v_shuffle_v3f32_v4f32__6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1543,16 +1475,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1560,16 +1490,15 @@ define void @v_shuffle_v3f32_v4f32__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1577,17 +1506,15 @@ define void @v_shuffle_v3f32_v4f32__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1601,16 +1528,14 @@ define void @v_shuffle_v3f32_v4f32__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1618,15 +1543,14 @@ define void @v_shuffle_v3f32_v4f32__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1634,16 +1558,15 @@ define void @v_shuffle_v3f32_v4f32__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1657,17 +1580,15 @@ define void @v_shuffle_v3f32_v4f32__7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1675,16 +1596,15 @@ define void @v_shuffle_v3f32_v4f32__7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1692,17 +1612,15 @@ define void @v_shuffle_v3f32_v4f32__7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1716,50 +1634,48 @@ define void @v_shuffle_v3f32_v4f32__7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v10, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v9 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v10, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1773,17 +1689,15 @@ define void @v_shuffle_v3f32_v4f32__7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1829,17 +1743,15 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1847,16 +1759,15 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1864,16 +1775,16 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1887,16 +1798,15 @@ define void @v_shuffle_v3f32_v4f32__7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1943,16 +1853,15 @@ define void @v_shuffle_v3f32_v4f32__7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1960,16 +1869,15 @@ define void @v_shuffle_v3f32_v4f32__7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1977,17 +1885,16 @@ define void @v_shuffle_v3f32_v4f32__7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2133,10 +2040,10 @@ define void @v_shuffle_v3f32_v4f32__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2146,10 +2053,10 @@ define void @v_shuffle_v3f32_v4f32__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2159,10 +2066,10 @@ define void @v_shuffle_v3f32_v4f32__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2175,13 +2082,13 @@ define void @v_shuffle_v3f32_v4f32__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2319,15 +2226,14 @@ define void @v_shuffle_v3f32_v4f32__6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2335,15 +2241,15 @@ define void @v_shuffle_v3f32_v4f32__6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2351,16 +2257,15 @@ define void @v_shuffle_v3f32_v4f32__6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2375,15 +2280,14 @@ define void @v_shuffle_v3f32_v4f32__7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2430,16 +2334,14 @@ define void @v_shuffle_v3f32_v4f32__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2486,17 +2388,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2546,16 +2445,14 @@ define void @v_shuffle_v3f32_v4f32__7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2604,15 +2501,14 @@ define void @v_shuffle_v3f32_v4f32__7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2659,16 +2555,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2717,15 +2611,14 @@ define void @v_shuffle_v3f32_v4f32__7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2773,15 +2666,14 @@ define void @v_shuffle_v3f32_v4f32__7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2961,10 +2853,10 @@ define void @v_shuffle_v3f32_v4f32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2974,10 +2866,10 @@ define void @v_shuffle_v3f32_v4f32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2987,10 +2879,10 @@ define void @v_shuffle_v3f32_v4f32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3003,13 +2895,13 @@ define void @v_shuffle_v3f32_v4f32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3146,15 +3038,14 @@ define void @v_shuffle_v3f32_v4f32__6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3162,15 +3053,14 @@ define void @v_shuffle_v3f32_v4f32__6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3178,15 +3068,15 @@ define void @v_shuffle_v3f32_v4f32__6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3201,15 +3091,14 @@ define void @v_shuffle_v3f32_v4f32__7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3255,15 +3144,14 @@ define void @v_shuffle_v3f32_v4f32__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3309,16 +3197,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3366,15 +3252,14 @@ define void @v_shuffle_v3f32_v4f32__7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3420,15 +3305,14 @@ define void @v_shuffle_v3f32_v4f32__7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3475,15 +3359,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3531,14 +3414,13 @@ define void @v_shuffle_v3f32_v4f32__7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v1 ; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3586,15 +3468,14 @@ define void @v_shuffle_v3f32_v4f32__7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3641,38 +3522,37 @@ define void @v_shuffle_v3f32_v4f32__u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3745,11 +3625,10 @@ define void @v_shuffle_v3f32_v4f32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3759,11 +3638,10 @@ define void @v_shuffle_v3f32_v4f32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3776,41 +3654,37 @@ define void @v_shuffle_v3f32_v4f32__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3823,13 +3697,13 @@ define void @v_shuffle_v3f32_v4f32__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3839,11 +3713,10 @@ define void @v_shuffle_v3f32_v4f32__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3853,11 +3726,10 @@ define void @v_shuffle_v3f32_v4f32__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3870,38 +3742,37 @@ define void @v_shuffle_v3f32_v4f32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__4_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3933,14 +3804,13 @@ define void @v_shuffle_v3f32_v4f32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3950,14 +3820,13 @@ define void @v_shuffle_v3f32_v4f32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3972,15 +3841,14 @@ define void @v_shuffle_v3f32_v4f32__6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3988,16 +3856,15 @@ define void @v_shuffle_v3f32_v4f32__6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4005,16 +3872,15 @@ define void @v_shuffle_v3f32_v4f32__6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4028,16 +3894,15 @@ define void @v_shuffle_v3f32_v4f32__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4047,14 +3912,13 @@ define void @v_shuffle_v3f32_v4f32__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4064,14 +3928,13 @@ define void @v_shuffle_v3f32_v4f32__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4085,15 +3948,14 @@ define void @v_shuffle_v3f32_v4f32__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4140,15 +4002,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4197,15 +4058,14 @@ define void @v_shuffle_v3f32_v4f32__7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4251,15 +4111,14 @@ define void @v_shuffle_v3f32_v4f32__7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4308,15 +4167,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4365,15 +4223,14 @@ define void @v_shuffle_v3f32_v4f32__7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4420,15 +4277,14 @@ define void @v_shuffle_v3f32_v4f32__7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4567,36 +4423,33 @@ define void @v_shuffle_v3f32_v4f32__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__2_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4609,12 +4462,11 @@ define void @v_shuffle_v3f32_v4f32__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4675,29 +4527,27 @@ define void @v_shuffle_v3f32_v4f32__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4714,40 +4564,36 @@ define void @v_shuffle_v3f32_v4f32__6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__6_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4761,43 +4607,40 @@ define void @v_shuffle_v3f32_v4f32__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4811,40 +4654,37 @@ define void @v_shuffle_v3f32_v4f32__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4859,50 +4699,47 @@ define void @v_shuffle_v3f32_v4f32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_0_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx3 v10, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_0_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx3 v10, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4917,14 +4754,14 @@ define void @v_shuffle_v3f32_v4f32__7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4970,14 +4807,14 @@ define void @v_shuffle_v3f32_v4f32__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4985,15 +4822,14 @@ define void @v_shuffle_v3f32_v4f32__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5002,15 +4838,14 @@ define void @v_shuffle_v3f32_v4f32__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -5026,16 +4861,14 @@ define void @v_shuffle_v3f32_v4f32__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5049,10 +4882,8 @@ define void @v_shuffle_v3f32_v4f32__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5066,10 +4897,9 @@ define void @v_shuffle_v3f32_v4f32__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5083,43 +4913,40 @@ define void @v_shuffle_v3f32_v4f32__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5133,42 +4960,40 @@ define void @v_shuffle_v3f32_v4f32__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5342,9 +5167,8 @@ define void @v_shuffle_v3f32_v4f32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5352,15 +5176,15 @@ define void @v_shuffle_v3f32_v4f32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5368,15 +5192,15 @@ define void @v_shuffle_v3f32_v4f32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5394,12 +5218,11 @@ define void @v_shuffle_v3f32_v4f32__3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5537,10 +5360,10 @@ define void @v_shuffle_v3f32_v4f32__6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5550,10 +5373,10 @@ define void @v_shuffle_v3f32_v4f32__6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5563,10 +5386,10 @@ define void @v_shuffle_v3f32_v4f32__6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5580,13 +5403,13 @@ define void @v_shuffle_v3f32_v4f32__7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5626,13 +5449,12 @@ define void @v_shuffle_v3f32_v4f32__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5673,16 +5495,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5731,15 +5551,14 @@ define void @v_shuffle_v3f32_v4f32__7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5787,15 +5606,14 @@ define void @v_shuffle_v3f32_v4f32__7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5844,16 +5662,14 @@ define void @v_shuffle_v3f32_v4f32__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5863,14 +5679,13 @@ define void @v_shuffle_v3f32_v4f32__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5880,14 +5695,13 @@ define void @v_shuffle_v3f32_v4f32__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5901,14 +5715,13 @@ define void @v_shuffle_v3f32_v4f32__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6162,9 +5975,9 @@ define void @v_shuffle_v3f32_v4f32__2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6172,15 +5985,15 @@ define void @v_shuffle_v3f32_v4f32__2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6188,15 +6001,15 @@ define void @v_shuffle_v3f32_v4f32__2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6214,12 +6027,12 @@ define void @v_shuffle_v3f32_v4f32__3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v6 -; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6356,10 +6169,10 @@ define void @v_shuffle_v3f32_v4f32__6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6369,10 +6182,10 @@ define void @v_shuffle_v3f32_v4f32__6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6382,10 +6195,10 @@ define void @v_shuffle_v3f32_v4f32__6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6399,13 +6212,13 @@ define void @v_shuffle_v3f32_v4f32__7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6445,12 +6258,12 @@ define void @v_shuffle_v3f32_v4f32__7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6489,15 +6302,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6545,15 +6357,14 @@ define void @v_shuffle_v3f32_v4f32__7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6601,15 +6412,14 @@ define void @v_shuffle_v3f32_v4f32__7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6656,15 +6466,14 @@ define void @v_shuffle_v3f32_v4f32__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6672,15 +6481,15 @@ define void @v_shuffle_v3f32_v4f32__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6688,15 +6497,15 @@ define void @v_shuffle_v3f32_v4f32__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6759,12 +6568,13 @@ define void @v_shuffle_v3f32_v4f32__7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6802,38 +6612,37 @@ define void @v_shuffle_v3f32_v4f32__u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__u_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__u_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__u_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6920,16 +6729,15 @@ define void @v_shuffle_v3f32_v4f32__1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6937,16 +6745,15 @@ define void @v_shuffle_v3f32_v4f32__1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6967,9 +6774,9 @@ define void @v_shuffle_v3f32_v4f32__2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6979,14 +6786,13 @@ define void @v_shuffle_v3f32_v4f32__2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6996,14 +6802,13 @@ define void @v_shuffle_v3f32_v4f32__2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7021,12 +6826,12 @@ define void @v_shuffle_v3f32_v4f32__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7034,16 +6839,15 @@ define void @v_shuffle_v3f32_v4f32__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7051,16 +6855,16 @@ define void @v_shuffle_v3f32_v4f32__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7135,11 +6939,10 @@ define void @v_shuffle_v3f32_v4f32__5_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7149,11 +6952,10 @@ define void @v_shuffle_v3f32_v4f32__5_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7167,41 +6969,37 @@ define void @v_shuffle_v3f32_v4f32__6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__6_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__6_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7215,12 +7013,12 @@ define void @v_shuffle_v3f32_v4f32__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7261,15 +7059,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7319,15 +7116,14 @@ define void @v_shuffle_v3f32_v4f32__7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7375,15 +7171,14 @@ define void @v_shuffle_v3f32_v4f32__7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7432,15 +7227,14 @@ define void @v_shuffle_v3f32_v4f32__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7450,14 +7244,13 @@ define void @v_shuffle_v3f32_v4f32__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7467,14 +7260,13 @@ define void @v_shuffle_v3f32_v4f32__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7488,13 +7280,13 @@ define void @v_shuffle_v3f32_v4f32__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7537,13 +7329,13 @@ define void @v_shuffle_v3f32_v4f32__7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7583,12 +7375,13 @@ define void @v_shuffle_v3f32_v4f32__7_6_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll index f36f23a3a932d..6a0527f7cca24 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll @@ -776,15 +776,14 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll index eacf77c931a68..65ceea2299e10 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v3i32_v3i32__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v3i32_v3i32__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -237,48 +231,45 @@ define void @v_shuffle_v3i32_v3i32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -292,46 +283,43 @@ define void @v_shuffle_v3i32_v3i32__5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -345,15 +333,14 @@ define void @v_shuffle_v3i32_v3i32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -361,15 +348,14 @@ define void @v_shuffle_v3i32_v3i32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -377,15 +363,14 @@ define void @v_shuffle_v3i32_v3i32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -401,37 +386,35 @@ define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -445,36 +428,37 @@ define void @v_shuffle_v3i32_v3i32__5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v3i32_v3i32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -533,50 +516,46 @@ define void @v_shuffle_v3i32_v3i32__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -591,14 +570,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -606,16 +585,15 @@ define void @v_shuffle_v3i32_v3i32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -623,17 +601,15 @@ define void @v_shuffle_v3i32_v3i32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -648,14 +624,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -663,15 +639,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -679,16 +654,15 @@ define void @v_shuffle_v3i32_v3i32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -702,42 +676,40 @@ define void @v_shuffle_v3i32_v3i32__5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -751,42 +723,40 @@ define void @v_shuffle_v3i32_v3i32__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -800,39 +770,40 @@ define void @v_shuffle_v3i32_v3i32__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -858,26 +829,25 @@ define void @v_shuffle_v3i32_v3i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -948,29 +918,27 @@ define void @v_shuffle_v3i32_v3i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -983,42 +951,40 @@ define void @v_shuffle_v3i32_v3i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1043,26 +1009,25 @@ define void @v_shuffle_v3i32_v3i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1091,16 +1056,15 @@ define void @v_shuffle_v3i32_v3i32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1108,17 +1072,15 @@ define void @v_shuffle_v3i32_v3i32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1133,50 +1095,45 @@ define void @v_shuffle_v3i32_v3i32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1190,49 +1147,42 @@ define void @v_shuffle_v3i32_v3i32__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1246,52 +1196,46 @@ define void @v_shuffle_v3i32_v3i32__5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1305,50 +1249,46 @@ define void @v_shuffle_v3i32_v3i32__5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1362,50 +1302,46 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1419,49 +1355,46 @@ define void @v_shuffle_v3i32_v3i32__5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1604,39 +1537,40 @@ define void @v_shuffle_v3i32_v3i32__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__2_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1747,15 +1681,14 @@ define void @v_shuffle_v3i32_v3i32__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1763,15 +1696,14 @@ define void @v_shuffle_v3i32_v3i32__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1779,16 +1711,14 @@ define void @v_shuffle_v3i32_v3i32__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1802,15 +1732,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1818,15 +1747,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,16 +1762,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1857,15 +1783,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1873,16 +1798,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,17 +1814,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1915,15 +1837,14 @@ define void @v_shuffle_v3i32_v3i32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1933,14 +1854,13 @@ define void @v_shuffle_v3i32_v3i32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1950,14 +1870,13 @@ define void @v_shuffle_v3i32_v3i32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1974,13 +1893,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1988,16 +1906,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2005,16 +1922,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2029,15 +1945,14 @@ define void @v_shuffle_v3i32_v3i32__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2045,15 +1960,15 @@ define void @v_shuffle_v3i32_v3i32__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2061,16 +1976,15 @@ define void @v_shuffle_v3i32_v3i32__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2212,39 +2126,40 @@ define void @v_shuffle_v3i32_v3i32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2353,16 +2268,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2370,15 +2284,14 @@ define void @v_shuffle_v3i32_v3i32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2386,15 +2299,14 @@ define void @v_shuffle_v3i32_v3i32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2408,46 +2320,43 @@ define void @v_shuffle_v3i32_v3i32__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2462,15 +2371,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,14 +2388,13 @@ define void @v_shuffle_v3i32_v3i32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2497,14 +2404,13 @@ define void @v_shuffle_v3i32_v3i32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2518,46 +2424,43 @@ define void @v_shuffle_v3i32_v3i32__5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2574,13 +2477,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2588,15 +2490,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2604,16 +2505,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2628,15 +2528,14 @@ define void @v_shuffle_v3i32_v3i32__5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2644,15 +2543,14 @@ define void @v_shuffle_v3i32_v3i32__5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2660,15 +2558,15 @@ define void @v_shuffle_v3i32_v3i32__5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2773,36 +2671,33 @@ define void @v_shuffle_v3i32_v3i32__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2839,29 +2734,27 @@ define void @v_shuffle_v3i32_v3i32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2875,42 +2768,40 @@ define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2924,13 +2815,12 @@ define void @v_shuffle_v3i32_v3i32__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2970,14 +2860,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2985,16 +2875,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3002,16 +2891,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3026,14 +2914,14 @@ define void @v_shuffle_v3i32_v3i32__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3041,14 +2929,14 @@ define void @v_shuffle_v3i32_v3i32__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3056,15 +2944,15 @@ define void @v_shuffle_v3i32_v3i32__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3079,14 +2967,14 @@ define void @v_shuffle_v3i32_v3i32__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3094,16 +2982,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3111,16 +2998,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3147,29 +3033,27 @@ define void @v_shuffle_v3i32_v3i32__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3340,12 +3224,11 @@ define void @v_shuffle_v3i32_v3i32__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3355,13 +3238,13 @@ define void @v_shuffle_v3i32_v3i32__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3371,13 +3254,13 @@ define void @v_shuffle_v3i32_v3i32__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3480,39 +3363,40 @@ define void @v_shuffle_v3i32_v3i32__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3526,38 +3410,37 @@ define void @v_shuffle_v3i32_v3i32__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3572,15 +3455,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3588,16 +3470,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3605,16 +3486,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3629,15 +3509,14 @@ define void @v_shuffle_v3i32_v3i32__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3645,15 +3524,14 @@ define void @v_shuffle_v3i32_v3i32__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3661,16 +3539,15 @@ define void @v_shuffle_v3i32_v3i32__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3685,15 +3562,14 @@ define void @v_shuffle_v3i32_v3i32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3701,16 +3577,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3718,16 +3593,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3741,41 +3615,40 @@ define void @v_shuffle_v3i32_v3i32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3945,15 +3818,15 @@ define void @v_shuffle_v3i32_v3i32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,13 +3836,13 @@ define void @v_shuffle_v3i32_v3i32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3979,13 +3852,13 @@ define void @v_shuffle_v3i32_v3i32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4087,36 +3960,37 @@ define void @v_shuffle_v3i32_v3i32__5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4131,15 +4005,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4147,15 +4020,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4163,16 +4036,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4187,15 +4059,14 @@ define void @v_shuffle_v3i32_v3i32__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4203,15 +4074,14 @@ define void @v_shuffle_v3i32_v3i32__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4219,16 +4089,15 @@ define void @v_shuffle_v3i32_v3i32__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4243,15 +4112,14 @@ define void @v_shuffle_v3i32_v3i32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4259,15 +4127,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4275,15 +4143,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4297,42 +4165,40 @@ define void @v_shuffle_v3i32_v3i32__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4346,36 +4212,40 @@ define void @v_shuffle_v3i32_v3i32__5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll index 92d6c95c26599..3d838b9952147 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v3i32_v4i32__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -141,12 +138,11 @@ define void @v_shuffle_v3i32_v4i32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -236,36 +232,33 @@ define void @v_shuffle_v3i32_v4i32__6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__6_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__6_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -279,12 +272,11 @@ define void @v_shuffle_v3i32_v4i32__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -322,16 +314,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +368,14 @@ define void @v_shuffle_v3i32_v4i32__7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -431,15 +420,14 @@ define void @v_shuffle_v3i32_v4i32__7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -485,16 +473,14 @@ define void @v_shuffle_v3i32_v4i32__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -508,9 +494,8 @@ define void @v_shuffle_v3i32_v4i32__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -524,9 +509,9 @@ define void @v_shuffle_v3i32_v4i32__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -540,13 +525,12 @@ define void @v_shuffle_v3i32_v4i32__7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -586,12 +570,12 @@ define void @v_shuffle_v3i32_v4i32__7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -629,12 +613,12 @@ define void @v_shuffle_v3i32_v4i32__7_6_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -674,13 +658,12 @@ define void @v_shuffle_v3i32_v4i32__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -691,9 +674,8 @@ define void @v_shuffle_v3i32_v4i32__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -704,9 +686,8 @@ define void @v_shuffle_v3i32_v4i32__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -720,17 +701,15 @@ define void @v_shuffle_v3i32_v4i32__7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -740,14 +719,12 @@ define void @v_shuffle_v3i32_v4i32__7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -757,15 +734,12 @@ define void @v_shuffle_v3i32_v4i32__7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -780,16 +754,14 @@ define void @v_shuffle_v3i32_v4i32__7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -797,16 +769,15 @@ define void @v_shuffle_v3i32_v4i32__7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -814,17 +785,15 @@ define void @v_shuffle_v3i32_v4i32__7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -839,15 +808,14 @@ define void @v_shuffle_v3i32_v4i32__7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -855,15 +823,14 @@ define void @v_shuffle_v3i32_v4i32__7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -871,16 +838,15 @@ define void @v_shuffle_v3i32_v4i32__7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -895,15 +861,14 @@ define void @v_shuffle_v3i32_v4i32__7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -911,16 +876,15 @@ define void @v_shuffle_v3i32_v4i32__7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -928,16 +892,15 @@ define void @v_shuffle_v3i32_v4i32__7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -951,43 +914,39 @@ define void @v_shuffle_v3i32_v4i32__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1001,42 +960,39 @@ define void @v_shuffle_v3i32_v4i32__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1050,13 +1006,13 @@ define void @v_shuffle_v3i32_v4i32__7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1096,13 +1052,13 @@ define void @v_shuffle_v3i32_v4i32__7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1112,11 +1068,10 @@ define void @v_shuffle_v3i32_v4i32__7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1126,11 +1081,10 @@ define void @v_shuffle_v3i32_v4i32__7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1145,38 +1099,36 @@ define void @v_shuffle_v3i32_v4i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1247,29 +1199,27 @@ define void @v_shuffle_v3i32_v4i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1285,40 +1235,36 @@ define void @v_shuffle_v3i32_v4i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1331,43 +1277,40 @@ define void @v_shuffle_v3i32_v4i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1381,38 +1324,36 @@ define void @v_shuffle_v3i32_v4i32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__4_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__4_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1442,16 +1383,15 @@ define void @v_shuffle_v3i32_v4i32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1459,17 +1399,15 @@ define void @v_shuffle_v3i32_v4i32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1486,14 +1424,12 @@ define void @v_shuffle_v3i32_v4i32__6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1503,14 +1439,12 @@ define void @v_shuffle_v3i32_v4i32__6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1520,15 +1454,13 @@ define void @v_shuffle_v3i32_v4i32__6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1543,16 +1475,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1560,16 +1490,15 @@ define void @v_shuffle_v3i32_v4i32__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1577,17 +1506,15 @@ define void @v_shuffle_v3i32_v4i32__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1601,16 +1528,14 @@ define void @v_shuffle_v3i32_v4i32__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1618,15 +1543,14 @@ define void @v_shuffle_v3i32_v4i32__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1634,16 +1558,15 @@ define void @v_shuffle_v3i32_v4i32__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1657,17 +1580,15 @@ define void @v_shuffle_v3i32_v4i32__7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1675,16 +1596,15 @@ define void @v_shuffle_v3i32_v4i32__7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1692,17 +1612,15 @@ define void @v_shuffle_v3i32_v4i32__7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1716,50 +1634,48 @@ define void @v_shuffle_v3i32_v4i32__7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v10, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v9 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v10, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1773,17 +1689,15 @@ define void @v_shuffle_v3i32_v4i32__7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1829,17 +1743,15 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1847,16 +1759,15 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1864,16 +1775,16 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1887,16 +1798,15 @@ define void @v_shuffle_v3i32_v4i32__7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1943,16 +1853,15 @@ define void @v_shuffle_v3i32_v4i32__7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1960,16 +1869,15 @@ define void @v_shuffle_v3i32_v4i32__7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1977,17 +1885,16 @@ define void @v_shuffle_v3i32_v4i32__7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2133,10 +2040,10 @@ define void @v_shuffle_v3i32_v4i32__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2146,10 +2053,10 @@ define void @v_shuffle_v3i32_v4i32__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2159,10 +2066,10 @@ define void @v_shuffle_v3i32_v4i32__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2175,13 +2082,13 @@ define void @v_shuffle_v3i32_v4i32__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2319,15 +2226,14 @@ define void @v_shuffle_v3i32_v4i32__6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2335,15 +2241,15 @@ define void @v_shuffle_v3i32_v4i32__6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2351,16 +2257,15 @@ define void @v_shuffle_v3i32_v4i32__6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2375,15 +2280,14 @@ define void @v_shuffle_v3i32_v4i32__7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2430,16 +2334,14 @@ define void @v_shuffle_v3i32_v4i32__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2486,17 +2388,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2546,16 +2445,14 @@ define void @v_shuffle_v3i32_v4i32__7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2604,15 +2501,14 @@ define void @v_shuffle_v3i32_v4i32__7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2659,16 +2555,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2717,15 +2611,14 @@ define void @v_shuffle_v3i32_v4i32__7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2773,15 +2666,14 @@ define void @v_shuffle_v3i32_v4i32__7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2961,10 +2853,10 @@ define void @v_shuffle_v3i32_v4i32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2974,10 +2866,10 @@ define void @v_shuffle_v3i32_v4i32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2987,10 +2879,10 @@ define void @v_shuffle_v3i32_v4i32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3003,13 +2895,13 @@ define void @v_shuffle_v3i32_v4i32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3146,15 +3038,14 @@ define void @v_shuffle_v3i32_v4i32__6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3162,15 +3053,14 @@ define void @v_shuffle_v3i32_v4i32__6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3178,15 +3068,15 @@ define void @v_shuffle_v3i32_v4i32__6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3201,15 +3091,14 @@ define void @v_shuffle_v3i32_v4i32__7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3255,15 +3144,14 @@ define void @v_shuffle_v3i32_v4i32__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3309,16 +3197,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3366,15 +3252,14 @@ define void @v_shuffle_v3i32_v4i32__7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3420,15 +3305,14 @@ define void @v_shuffle_v3i32_v4i32__7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3475,15 +3359,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3531,14 +3414,13 @@ define void @v_shuffle_v3i32_v4i32__7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v1 ; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3586,15 +3468,14 @@ define void @v_shuffle_v3i32_v4i32__7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3641,38 +3522,37 @@ define void @v_shuffle_v3i32_v4i32__u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3745,11 +3625,10 @@ define void @v_shuffle_v3i32_v4i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3759,11 +3638,10 @@ define void @v_shuffle_v3i32_v4i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3776,41 +3654,37 @@ define void @v_shuffle_v3i32_v4i32__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3823,13 +3697,13 @@ define void @v_shuffle_v3i32_v4i32__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3839,11 +3713,10 @@ define void @v_shuffle_v3i32_v4i32__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3853,11 +3726,10 @@ define void @v_shuffle_v3i32_v4i32__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3870,38 +3742,37 @@ define void @v_shuffle_v3i32_v4i32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__4_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3933,14 +3804,13 @@ define void @v_shuffle_v3i32_v4i32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3950,14 +3820,13 @@ define void @v_shuffle_v3i32_v4i32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3972,15 +3841,14 @@ define void @v_shuffle_v3i32_v4i32__6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3988,16 +3856,15 @@ define void @v_shuffle_v3i32_v4i32__6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4005,16 +3872,15 @@ define void @v_shuffle_v3i32_v4i32__6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4028,16 +3894,15 @@ define void @v_shuffle_v3i32_v4i32__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4047,14 +3912,13 @@ define void @v_shuffle_v3i32_v4i32__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4064,14 +3928,13 @@ define void @v_shuffle_v3i32_v4i32__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4085,15 +3948,14 @@ define void @v_shuffle_v3i32_v4i32__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4140,15 +4002,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4197,15 +4058,14 @@ define void @v_shuffle_v3i32_v4i32__7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4251,15 +4111,14 @@ define void @v_shuffle_v3i32_v4i32__7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4308,15 +4167,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4365,15 +4223,14 @@ define void @v_shuffle_v3i32_v4i32__7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4420,15 +4277,14 @@ define void @v_shuffle_v3i32_v4i32__7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4567,36 +4423,33 @@ define void @v_shuffle_v3i32_v4i32__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__2_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4609,12 +4462,11 @@ define void @v_shuffle_v3i32_v4i32__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4675,29 +4527,27 @@ define void @v_shuffle_v3i32_v4i32__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4714,40 +4564,36 @@ define void @v_shuffle_v3i32_v4i32__6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__6_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4761,43 +4607,40 @@ define void @v_shuffle_v3i32_v4i32__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4811,40 +4654,37 @@ define void @v_shuffle_v3i32_v4i32__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4859,50 +4699,47 @@ define void @v_shuffle_v3i32_v4i32__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_0_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx3 v10, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_0_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx3 v10, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4917,14 +4754,14 @@ define void @v_shuffle_v3i32_v4i32__7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4970,14 +4807,14 @@ define void @v_shuffle_v3i32_v4i32__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4985,15 +4822,14 @@ define void @v_shuffle_v3i32_v4i32__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5002,15 +4838,14 @@ define void @v_shuffle_v3i32_v4i32__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -5026,16 +4861,14 @@ define void @v_shuffle_v3i32_v4i32__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5049,10 +4882,8 @@ define void @v_shuffle_v3i32_v4i32__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5066,10 +4897,9 @@ define void @v_shuffle_v3i32_v4i32__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5083,43 +4913,40 @@ define void @v_shuffle_v3i32_v4i32__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5133,42 +4960,40 @@ define void @v_shuffle_v3i32_v4i32__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5342,9 +5167,8 @@ define void @v_shuffle_v3i32_v4i32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5352,15 +5176,15 @@ define void @v_shuffle_v3i32_v4i32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5368,15 +5192,15 @@ define void @v_shuffle_v3i32_v4i32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5394,12 +5218,11 @@ define void @v_shuffle_v3i32_v4i32__3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5537,10 +5360,10 @@ define void @v_shuffle_v3i32_v4i32__6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5550,10 +5373,10 @@ define void @v_shuffle_v3i32_v4i32__6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5563,10 +5386,10 @@ define void @v_shuffle_v3i32_v4i32__6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5580,13 +5403,13 @@ define void @v_shuffle_v3i32_v4i32__7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5626,13 +5449,12 @@ define void @v_shuffle_v3i32_v4i32__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5673,16 +5495,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5731,15 +5551,14 @@ define void @v_shuffle_v3i32_v4i32__7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5787,15 +5606,14 @@ define void @v_shuffle_v3i32_v4i32__7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5844,16 +5662,14 @@ define void @v_shuffle_v3i32_v4i32__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5863,14 +5679,13 @@ define void @v_shuffle_v3i32_v4i32__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5880,14 +5695,13 @@ define void @v_shuffle_v3i32_v4i32__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5901,14 +5715,13 @@ define void @v_shuffle_v3i32_v4i32__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6162,9 +5975,9 @@ define void @v_shuffle_v3i32_v4i32__2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6172,15 +5985,15 @@ define void @v_shuffle_v3i32_v4i32__2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6188,15 +6001,15 @@ define void @v_shuffle_v3i32_v4i32__2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6214,12 +6027,12 @@ define void @v_shuffle_v3i32_v4i32__3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v6 -; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6356,10 +6169,10 @@ define void @v_shuffle_v3i32_v4i32__6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6369,10 +6182,10 @@ define void @v_shuffle_v3i32_v4i32__6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6382,10 +6195,10 @@ define void @v_shuffle_v3i32_v4i32__6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6399,13 +6212,13 @@ define void @v_shuffle_v3i32_v4i32__7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6445,12 +6258,12 @@ define void @v_shuffle_v3i32_v4i32__7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6489,15 +6302,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6545,15 +6357,14 @@ define void @v_shuffle_v3i32_v4i32__7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6601,15 +6412,14 @@ define void @v_shuffle_v3i32_v4i32__7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6656,15 +6466,14 @@ define void @v_shuffle_v3i32_v4i32__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6672,15 +6481,15 @@ define void @v_shuffle_v3i32_v4i32__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6688,15 +6497,15 @@ define void @v_shuffle_v3i32_v4i32__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6759,12 +6568,13 @@ define void @v_shuffle_v3i32_v4i32__7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6802,38 +6612,37 @@ define void @v_shuffle_v3i32_v4i32__u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__u_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__u_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__u_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6920,16 +6729,15 @@ define void @v_shuffle_v3i32_v4i32__1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6937,16 +6745,15 @@ define void @v_shuffle_v3i32_v4i32__1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6967,9 +6774,9 @@ define void @v_shuffle_v3i32_v4i32__2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6979,14 +6786,13 @@ define void @v_shuffle_v3i32_v4i32__2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6996,14 +6802,13 @@ define void @v_shuffle_v3i32_v4i32__2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7021,12 +6826,12 @@ define void @v_shuffle_v3i32_v4i32__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7034,16 +6839,15 @@ define void @v_shuffle_v3i32_v4i32__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7051,16 +6855,16 @@ define void @v_shuffle_v3i32_v4i32__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7135,11 +6939,10 @@ define void @v_shuffle_v3i32_v4i32__5_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7149,11 +6952,10 @@ define void @v_shuffle_v3i32_v4i32__5_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7167,41 +6969,37 @@ define void @v_shuffle_v3i32_v4i32__6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__6_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__6_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7215,12 +7013,12 @@ define void @v_shuffle_v3i32_v4i32__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7261,15 +7059,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7319,15 +7116,14 @@ define void @v_shuffle_v3i32_v4i32__7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7375,15 +7171,14 @@ define void @v_shuffle_v3i32_v4i32__7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7432,15 +7227,14 @@ define void @v_shuffle_v3i32_v4i32__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7450,14 +7244,13 @@ define void @v_shuffle_v3i32_v4i32__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7467,14 +7260,13 @@ define void @v_shuffle_v3i32_v4i32__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7488,13 +7280,13 @@ define void @v_shuffle_v3i32_v4i32__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7537,13 +7329,13 @@ define void @v_shuffle_v3i32_v4i32__7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7583,12 +7375,13 @@ define void @v_shuffle_v3i32_v4i32__7_6_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll index bbca5039bb02c..82ec200dae107 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll @@ -58,39 +58,33 @@ define void @v_shuffle_v3i64_v2i64__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -114,39 +108,33 @@ define void @v_shuffle_v3i64_v2i64__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -160,55 +148,42 @@ define void @v_shuffle_v3i64_v2i64__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -222,49 +197,43 @@ define void @v_shuffle_v3i64_v2i64__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -291,31 +260,27 @@ define void @v_shuffle_v3i64_v2i64__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -329,39 +294,40 @@ define void @v_shuffle_v3i64_v2i64__3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -375,51 +341,51 @@ define void @v_shuffle_v3i64_v2i64__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -433,51 +399,52 @@ define void @v_shuffle_v3i64_v2i64__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -491,42 +458,42 @@ define void @v_shuffle_v3i64_v2i64__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -540,42 +507,42 @@ define void @v_shuffle_v3i64_v2i64__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -589,42 +556,36 @@ define void @v_shuffle_v3i64_v2i64__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__u_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -699,32 +660,28 @@ define void @v_shuffle_v3i64_v2i64__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -737,42 +694,36 @@ define void @v_shuffle_v3i64_v2i64__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__2_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -785,57 +736,45 @@ define void @v_shuffle_v3i64_v2i64__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -849,52 +788,45 @@ define void @v_shuffle_v3i64_v2i64__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -910,15 +842,15 @@ define void @v_shuffle_v3i64_v2i64__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -927,15 +859,15 @@ define void @v_shuffle_v3i64_v2i64__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -944,15 +876,15 @@ define void @v_shuffle_v3i64_v2i64__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -966,52 +898,51 @@ define void @v_shuffle_v3i64_v2i64__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1109,42 +1040,42 @@ define void @v_shuffle_v3i64_v2i64__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1200,16 +1131,14 @@ define void @v_shuffle_v3i64_v2i64__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1217,16 +1146,14 @@ define void @v_shuffle_v3i64_v2i64__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1234,16 +1161,14 @@ define void @v_shuffle_v3i64_v2i64__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1258,16 +1183,14 @@ define void @v_shuffle_v3i64_v2i64__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1275,16 +1198,14 @@ define void @v_shuffle_v3i64_v2i64__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1292,16 +1213,14 @@ define void @v_shuffle_v3i64_v2i64__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1317,16 +1236,12 @@ define void @v_shuffle_v3i64_v2i64__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1336,16 +1251,12 @@ define void @v_shuffle_v3i64_v2i64__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1355,16 +1266,12 @@ define void @v_shuffle_v3i64_v2i64__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1379,57 +1286,52 @@ define void @v_shuffle_v3i64_v2i64__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1493,39 +1395,33 @@ define void @v_shuffle_v3i64_v2i64__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1563,32 +1459,28 @@ define void @v_shuffle_v3i64_v2i64__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1607,9 +1499,7 @@ define void @v_shuffle_v3i64_v2i64__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1621,9 +1511,7 @@ define void @v_shuffle_v3i64_v2i64__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1635,9 +1523,7 @@ define void @v_shuffle_v3i64_v2i64__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1651,57 +1537,45 @@ define void @v_shuffle_v3i64_v2i64__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_0_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1716,16 +1590,14 @@ define void @v_shuffle_v3i64_v2i64__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1733,16 +1605,14 @@ define void @v_shuffle_v3i64_v2i64__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1750,16 +1620,14 @@ define void @v_shuffle_v3i64_v2i64__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1878,13 +1746,13 @@ define void @v_shuffle_v3i64_v2i64__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1895,13 +1763,13 @@ define void @v_shuffle_v3i64_v2i64__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1912,13 +1780,13 @@ define void @v_shuffle_v3i64_v2i64__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1975,42 +1843,36 @@ define void @v_shuffle_v3i64_v2i64__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2024,57 +1886,45 @@ define void @v_shuffle_v3i64_v2i64__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_0_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2089,16 +1939,14 @@ define void @v_shuffle_v3i64_v2i64__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2106,16 +1954,14 @@ define void @v_shuffle_v3i64_v2i64__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2123,16 +1969,14 @@ define void @v_shuffle_v3i64_v2i64__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2160,32 +2004,28 @@ define void @v_shuffle_v3i64_v2i64__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2543,59 +2383,21 @@ define void @s_shuffle_v3i64_v2i64__3_3_u() { } define void @s_shuffle_v3i64_v2i64__3_3_0() { -; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_3_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_3_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_3_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v2i64__3_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=s"() %vec1 = call <2 x i64> asm "; def $0", "=s"() %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> @@ -2665,56 +2467,20 @@ define void @s_shuffle_v3i64_v2i64__3_3_1() { } define void @s_shuffle_v3i64_v2i64__3_3_2() { -; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v2i64__3_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=s"() %vec1 = call <2 x i64> asm "; def $0", "=s"() %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> @@ -2745,50 +2511,18 @@ define void @s_shuffle_v3i64_v2i64__3_3_3() { } define void @s_shuffle_v3i64_v2i64__u_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v2i64__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v2i64__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v2i64__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v2i64__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=s"() %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) @@ -2817,56 +2551,20 @@ define void @s_shuffle_v3i64_v2i64__0_0_0() { } define void @s_shuffle_v3i64_v2i64__1_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v2i64__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v2i64__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v2i64__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v2i64__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=s"() %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) @@ -2874,50 +2572,18 @@ define void @s_shuffle_v3i64_v2i64__1_0_0() { } define void @s_shuffle_v3i64_v2i64__2_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v2i64__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v2i64__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v2i64__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v2i64__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=s"() %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) @@ -2929,17 +2595,15 @@ define void @s_shuffle_v3i64_v2i64__3_0_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -2949,17 +2613,15 @@ define void @s_shuffle_v3i64_v2i64__3_0_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -2969,17 +2631,15 @@ define void @s_shuffle_v3i64_v2i64__3_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] +; GFX942-NEXT: ; def s[12:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:7] +; GFX942-NEXT: ; def s[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -2996,15 +2656,13 @@ define void @s_shuffle_v3i64_v2i64__3_u_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -3014,15 +2672,13 @@ define void @s_shuffle_v3i64_v2i64__3_u_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -3032,15 +2688,13 @@ define void @s_shuffle_v3i64_v2i64__3_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] +; GFX942-NEXT: ; def s[12:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:7] +; GFX942-NEXT: ; def s[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -3057,17 +2711,15 @@ define void @s_shuffle_v3i64_v2i64__3_1_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -3077,17 +2729,15 @@ define void @s_shuffle_v3i64_v2i64__3_1_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -3097,17 +2747,15 @@ define void @s_shuffle_v3i64_v2i64__3_1_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] +; GFX942-NEXT: ; def s[12:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:7] +; GFX942-NEXT: ; def s[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -3129,12 +2777,10 @@ define void @s_shuffle_v3i64_v2i64__3_2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -3149,12 +2795,10 @@ define void @s_shuffle_v3i64_v2i64__3_2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -3164,17 +2808,15 @@ define void @s_shuffle_v3i64_v2i64__3_2_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] +; GFX942-NEXT: ; def s[12:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:7] +; GFX942-NEXT: ; def s[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -3634,56 +3276,20 @@ define void @s_shuffle_v3i64_v2i64__2_2_2() { } define void @s_shuffle_v3i64_v2i64__3_2_2() { -; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v2i64__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=s"() %vec1 = call <2 x i64> asm "; def $0", "=s"() %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> @@ -3692,50 +3298,18 @@ define void @s_shuffle_v3i64_v2i64__3_2_2() { } define void @s_shuffle_v3i64_v2i64__3_u_2() { -; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v2i64__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=s"() %vec1 = call <2 x i64> asm "; def $0", "=s"() %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> @@ -3787,14 +3361,12 @@ define void @s_shuffle_v3i64_v2i64__3_0_2() { ; GFX942-NEXT: ; def s[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:7] +; GFX942-NEXT: ; def s[12:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -3807,59 +3379,21 @@ define void @s_shuffle_v3i64_v2i64__3_0_2() { } define void @s_shuffle_v3i64_v2i64__3_1_2() { -; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v2i64__3_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=s"() %vec1 = call <2 x i64> asm "; def $0", "=s"() %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll index f15dd7d2772e5..2fca4c2f1ff3a 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll @@ -100,39 +100,33 @@ define void @v_shuffle_v3i64_v3i64__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -196,39 +190,33 @@ define void @v_shuffle_v3i64_v3i64__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -242,55 +230,42 @@ define void @v_shuffle_v3i64_v3i64__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -304,49 +279,43 @@ define void @v_shuffle_v3i64_v3i64__5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -360,49 +329,43 @@ define void @v_shuffle_v3i64_v3i64__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -416,45 +379,40 @@ define void @v_shuffle_v3i64_v3i64__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -468,39 +426,40 @@ define void @v_shuffle_v3i64_v3i64__5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -514,39 +473,40 @@ define void @v_shuffle_v3i64_v3i64__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -560,51 +520,51 @@ define void @v_shuffle_v3i64_v3i64__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -618,51 +578,52 @@ define void @v_shuffle_v3i64_v3i64__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -676,51 +637,52 @@ define void @v_shuffle_v3i64_v3i64__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_5_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -734,42 +696,42 @@ define void @v_shuffle_v3i64_v3i64__5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -783,42 +745,42 @@ define void @v_shuffle_v3i64_v3i64__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -832,42 +794,42 @@ define void @v_shuffle_v3i64_v3i64__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -881,42 +843,36 @@ define void @v_shuffle_v3i64_v3i64__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__u_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1025,48 +981,42 @@ define void @v_shuffle_v3i64_v3i64__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__2_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1079,42 +1029,36 @@ define void @v_shuffle_v3i64_v3i64__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__3_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1185,57 +1129,45 @@ define void @v_shuffle_v3i64_v3i64__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1249,52 +1181,45 @@ define void @v_shuffle_v3i64_v3i64__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1308,52 +1233,51 @@ define void @v_shuffle_v3i64_v3i64__5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1367,51 +1291,51 @@ define void @v_shuffle_v3i64_v3i64__5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1425,52 +1349,51 @@ define void @v_shuffle_v3i64_v3i64__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1484,51 +1407,51 @@ define void @v_shuffle_v3i64_v3i64__5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1674,42 +1597,42 @@ define void @v_shuffle_v3i64_v3i64__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__2_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__2_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__2_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1823,16 +1746,14 @@ define void @v_shuffle_v3i64_v3i64__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1840,16 +1761,14 @@ define void @v_shuffle_v3i64_v3i64__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1857,16 +1776,14 @@ define void @v_shuffle_v3i64_v3i64__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1881,16 +1798,14 @@ define void @v_shuffle_v3i64_v3i64__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1898,16 +1813,14 @@ define void @v_shuffle_v3i64_v3i64__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1915,16 +1828,14 @@ define void @v_shuffle_v3i64_v3i64__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1938,57 +1849,45 @@ define void @v_shuffle_v3i64_v3i64__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2002,51 +1901,51 @@ define void @v_shuffle_v3i64_v3i64__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2062,18 +1961,16 @@ define void @v_shuffle_v3i64_v3i64__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_1: @@ -2081,17 +1978,15 @@ define void @v_shuffle_v3i64_v3i64__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,18 +1995,16 @@ define void @v_shuffle_v3i64_v3i64__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2125,16 +2018,16 @@ define void @v_shuffle_v3i64_v3i64__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 ; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2142,16 +2035,16 @@ define void @v_shuffle_v3i64_v3i64__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 ; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2159,16 +2052,17 @@ define void @v_shuffle_v3i64_v3i64__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_4_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 ; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -2315,42 +2209,42 @@ define void @v_shuffle_v3i64_v3i64__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2464,16 +2358,14 @@ define void @v_shuffle_v3i64_v3i64__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2481,16 +2373,14 @@ define void @v_shuffle_v3i64_v3i64__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2498,16 +2388,14 @@ define void @v_shuffle_v3i64_v3i64__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2522,16 +2410,14 @@ define void @v_shuffle_v3i64_v3i64__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2539,16 +2425,14 @@ define void @v_shuffle_v3i64_v3i64__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2556,16 +2440,14 @@ define void @v_shuffle_v3i64_v3i64__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2581,17 +2463,13 @@ define void @v_shuffle_v3i64_v3i64__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2600,17 +2478,13 @@ define void @v_shuffle_v3i64_v3i64__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2619,17 +2493,13 @@ define void @v_shuffle_v3i64_v3i64__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2643,51 +2513,46 @@ define void @v_shuffle_v3i64_v3i64__5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2701,57 +2566,52 @@ define void @v_shuffle_v3i64_v3i64__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2765,51 +2625,52 @@ define void @v_shuffle_v3i64_v3i64__5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v10 -; GFX900-NEXT: v_mov_b32_e32 v7, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v10 -; GFX90A-NEXT: v_mov_b32_e32 v7, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v10 -; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2915,39 +2776,33 @@ define void @v_shuffle_v3i64_v3i64__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3020,48 +2875,42 @@ define void @v_shuffle_v3i64_v3i64__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3080,9 +2929,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3094,9 +2941,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3108,9 +2953,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3124,57 +2967,45 @@ define void @v_shuffle_v3i64_v3i64__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3189,16 +3020,14 @@ define void @v_shuffle_v3i64_v3i64__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3206,16 +3035,14 @@ define void @v_shuffle_v3i64_v3i64__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3223,16 +3050,14 @@ define void @v_shuffle_v3i64_v3i64__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3247,16 +3072,14 @@ define void @v_shuffle_v3i64_v3i64__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3264,16 +3087,14 @@ define void @v_shuffle_v3i64_v3i64__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3281,16 +3102,14 @@ define void @v_shuffle_v3i64_v3i64__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3304,42 +3123,42 @@ define void @v_shuffle_v3i64_v3i64__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3516,13 +3335,13 @@ define void @v_shuffle_v3i64_v3i64__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3533,13 +3352,13 @@ define void @v_shuffle_v3i64_v3i64__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3550,13 +3369,13 @@ define void @v_shuffle_v3i64_v3i64__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3662,42 +3481,42 @@ define void @v_shuffle_v3i64_v3i64__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3711,42 +3530,36 @@ define void @v_shuffle_v3i64_v3i64__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 ; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3760,57 +3573,45 @@ define void @v_shuffle_v3i64_v3i64__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_0_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3825,16 +3626,14 @@ define void @v_shuffle_v3i64_v3i64__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3842,16 +3641,14 @@ define void @v_shuffle_v3i64_v3i64__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3859,16 +3656,14 @@ define void @v_shuffle_v3i64_v3i64__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3883,16 +3678,14 @@ define void @v_shuffle_v3i64_v3i64__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3900,16 +3693,14 @@ define void @v_shuffle_v3i64_v3i64__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3917,16 +3708,14 @@ define void @v_shuffle_v3i64_v3i64__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3940,48 +3729,42 @@ define void @v_shuffle_v3i64_v3i64__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4158,13 +3941,13 @@ define void @v_shuffle_v3i64_v3i64__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 ; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4175,13 +3958,13 @@ define void @v_shuffle_v3i64_v3i64__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 ; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4192,13 +3975,13 @@ define void @v_shuffle_v3i64_v3i64__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4304,42 +4087,36 @@ define void @v_shuffle_v3i64_v3i64__5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 ; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4353,57 +4130,45 @@ define void @v_shuffle_v3i64_v3i64__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_0_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4418,16 +4183,14 @@ define void @v_shuffle_v3i64_v3i64__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4435,16 +4198,14 @@ define void @v_shuffle_v3i64_v3i64__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4452,16 +4213,14 @@ define void @v_shuffle_v3i64_v3i64__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4476,16 +4235,14 @@ define void @v_shuffle_v3i64_v3i64__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4493,16 +4250,14 @@ define void @v_shuffle_v3i64_v3i64__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4510,16 +4265,14 @@ define void @v_shuffle_v3i64_v3i64__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4533,48 +4286,42 @@ define void @v_shuffle_v3i64_v3i64__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4588,42 +4335,42 @@ define void @v_shuffle_v3i64_v3i64__5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4759,10 +4506,9 @@ define void @s_shuffle_v3i64_v3i64__2_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -4860,10 +4606,9 @@ define void @s_shuffle_v3i64_v3i64__5_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -4880,15 +4625,13 @@ define void @s_shuffle_v3i64_v3i64__5_0_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -4898,15 +4641,13 @@ define void @s_shuffle_v3i64_v3i64__5_0_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -4969,11 +4710,11 @@ define void @s_shuffle_v3i64_v3i64__5_1_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -5024,13 +4765,11 @@ define void @s_shuffle_v3i64_v3i64__5_2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -5043,46 +4782,18 @@ define void @s_shuffle_v3i64_v3i64__5_2_u() { } define void @s_shuffle_v3i64_v3i64__5_3_u() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_3_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -5095,10 +4806,10 @@ define void @s_shuffle_v3i64_v3i64__5_4_u() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -5111,50 +4822,18 @@ define void @s_shuffle_v3i64_v3i64__5_4_u() { } define void @s_shuffle_v3i64_v3i64__5_5_u() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_5_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_5_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -5163,65 +4842,21 @@ define void @s_shuffle_v3i64_v3i64__5_5_u() { } define void @s_shuffle_v3i64_v3i64__5_5_0() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_5_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_5_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -5234,17 +4869,15 @@ define void @s_shuffle_v3i64_v3i64__5_5_1() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -5254,17 +4887,15 @@ define void @s_shuffle_v3i64_v3i64__5_5_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -5274,16 +4905,14 @@ define void @s_shuffle_v3i64_v3i64__5_5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -5304,12 +4933,10 @@ define void @s_shuffle_v3i64_v3i64__5_5_2() { ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -5322,12 +4949,10 @@ define void @s_shuffle_v3i64_v3i64__5_5_2() { ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -5339,13 +4964,12 @@ define void @s_shuffle_v3i64_v3i64__5_5_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -5358,52 +4982,20 @@ define void @s_shuffle_v3i64_v3i64__5_5_2() { } define void @s_shuffle_v3i64_v3i64__5_5_3() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_5_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_5_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -5412,74 +5004,38 @@ define void @s_shuffle_v3i64_v3i64__5_5_3() { } define void @s_shuffle_v3i64_v3i64__5_5_4() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_5_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_5_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + define void @s_shuffle_v3i64_v3i64__5_5_5() { ; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_5_5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -5492,50 +5048,18 @@ define void @s_shuffle_v3i64_v3i64__5_5_5() { } define void @s_shuffle_v3i64_v3i64__u_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) @@ -5564,56 +5088,20 @@ define void @s_shuffle_v3i64_v3i64__0_0_0() { } define void @s_shuffle_v3i64_v3i64__1_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) @@ -5621,52 +5109,20 @@ define void @s_shuffle_v3i64_v3i64__1_0_0() { } define void @s_shuffle_v3i64_v3i64__2_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) @@ -5674,50 +5130,18 @@ define void @s_shuffle_v3i64_v3i64__2_0_0() { } define void @s_shuffle_v3i64_v3i64__3_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__3_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__3_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) @@ -5732,14 +5156,12 @@ define void @s_shuffle_v3i64_v3i64__4_0_0() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -5752,14 +5174,12 @@ define void @s_shuffle_v3i64_v3i64__4_0_0() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -5769,17 +5189,15 @@ define void @s_shuffle_v3i64_v3i64__4_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -5792,63 +5210,21 @@ define void @s_shuffle_v3i64_v3i64__4_0_0() { } define void @s_shuffle_v3i64_v3i64__5_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -5857,307 +5233,140 @@ define void @s_shuffle_v3i64_v3i64__5_0_0() { } define void @s_shuffle_v3i64_v3i64__5_u_0() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_u_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v3i64__5_1_0() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_1_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_1_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_1_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v3i64__5_2_0() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_2_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_2_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_2_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v3i64__5_3_0() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_3_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v3i64__5_4_0() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_0: +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_0: +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_4_0: +; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_1_0() { +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_1_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_2_0() { +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_3_0() { +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_4_0() { +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_4_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -6229,12 +5438,12 @@ define void @s_shuffle_v3i64_v3i64__2_1_1() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -6364,12 +5573,10 @@ define void @s_shuffle_v3i64_v3i64__5_1_1() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s10 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 ; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -6387,15 +5594,13 @@ define void @s_shuffle_v3i64_v3i64__5_u_1() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -6405,15 +5610,13 @@ define void @s_shuffle_v3i64_v3i64__5_u_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -6446,17 +5649,15 @@ define void @s_shuffle_v3i64_v3i64__5_0_1() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -6466,17 +5667,15 @@ define void @s_shuffle_v3i64_v3i64__5_0_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -6507,61 +5706,23 @@ define void @s_shuffle_v3i64_v3i64__5_0_1() { } define void @s_shuffle_v3i64_v3i64__5_2_1() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_2_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_2_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_2_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -6574,17 +5735,15 @@ define void @s_shuffle_v3i64_v3i64__5_3_1() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -6594,17 +5753,15 @@ define void @s_shuffle_v3i64_v3i64__5_3_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -6639,15 +5796,15 @@ define void @s_shuffle_v3i64_v3i64__5_4_1() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -6657,15 +5814,15 @@ define void @s_shuffle_v3i64_v3i64__5_4_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -6675,14 +5832,14 @@ define void @s_shuffle_v3i64_v3i64__5_4_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 ; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -6759,12 +5916,12 @@ define void @s_shuffle_v3i64_v3i64__2_2_2() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -6894,12 +6051,10 @@ define void @s_shuffle_v3i64_v3i64__5_2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -6947,11 +6102,11 @@ define void @s_shuffle_v3i64_v3i64__5_u_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -6964,61 +6119,23 @@ define void @s_shuffle_v3i64_v3i64__5_u_2() { } define void @s_shuffle_v3i64_v3i64__5_0_2() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_0_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_0_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_0_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_0_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s16 +; GFX9-NEXT: s_mov_b32 s13, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -7061,11 +6178,11 @@ define void @s_shuffle_v3i64_v3i64__5_1_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -7116,13 +6233,12 @@ define void @s_shuffle_v3i64_v3i64__5_3_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -7139,15 +6255,13 @@ define void @s_shuffle_v3i64_v3i64__5_4_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -7157,15 +6271,13 @@ define void @s_shuffle_v3i64_v3i64__5_4_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -7177,13 +6289,12 @@ define void @s_shuffle_v3i64_v3i64__5_4_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -7256,252 +6367,155 @@ define void @s_shuffle_v3i64_v3i64__1_3_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__1_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v3i64__2_3_3() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__2_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v3i64__3_3_3() { -; GFX9-LABEL: s_shuffle_v3i64_v3i64__3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v3i64__4_3_3() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__4_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v3i64__5_3_3() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_3: +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_3_3: +; GFX942-LABEL: s_shuffle_v3i64_v3i64__1_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v3i64__5_u_3() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_3: +define void @s_shuffle_v3i64_v3i64__2_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_3: +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_u_3: +; GFX942-LABEL: s_shuffle_v3i64_v3i64__2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__3_3_3() { +; GFX9-LABEL: s_shuffle_v3i64_v3i64__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__4_3_3() { +; GFX9-LABEL: s_shuffle_v3i64_v3i64__4_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_3_3() { +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_u_3() { +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -7514,15 +6528,15 @@ define void @s_shuffle_v3i64_v3i64__5_0_3() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -7532,15 +6546,15 @@ define void @s_shuffle_v3i64_v3i64__5_0_3() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -7609,13 +6623,12 @@ define void @s_shuffle_v3i64_v3i64__5_1_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -7668,15 +6681,15 @@ define void @s_shuffle_v3i64_v3i64__5_2_3() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -7689,52 +6702,20 @@ define void @s_shuffle_v3i64_v3i64__5_2_3() { } define void @s_shuffle_v3i64_v3i64__5_4_3() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_4_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_4_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -7892,12 +6873,12 @@ define void @s_shuffle_v3i64_v3i64__2_4_4() { ; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -7910,12 +6891,12 @@ define void @s_shuffle_v3i64_v3i64__2_4_4() { ; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -7925,15 +6906,16 @@ define void @s_shuffle_v3i64_v3i64__2_4_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s11, s3 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -7986,70 +6968,42 @@ define void @s_shuffle_v3i64_v3i64__4_4_4() { call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } - -define void @s_shuffle_v3i64_v3i64__5_4_4() { -; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_4_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v3i64__5_u_4() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_u_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] + +define void @s_shuffle_v3i64_v3i64__5_4_4() { +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_u_4() { +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -8062,17 +7016,15 @@ define void @s_shuffle_v3i64_v3i64__5_0_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8082,17 +7034,15 @@ define void @s_shuffle_v3i64_v3i64__5_0_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8161,13 +7111,12 @@ define void @s_shuffle_v3i64_v3i64__5_1_4() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8220,17 +7169,15 @@ define void @s_shuffle_v3i64_v3i64__5_2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8243,52 +7190,20 @@ define void @s_shuffle_v3i64_v3i64__5_2_4() { } define void @s_shuffle_v3i64_v3i64__5_3_4() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_3_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_3_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -8446,12 +7361,12 @@ define void @s_shuffle_v3i64_v3i64__2_5_5() { ; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8464,12 +7379,12 @@ define void @s_shuffle_v3i64_v3i64__2_5_5() { ; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8479,15 +7394,16 @@ define void @s_shuffle_v3i64_v3i64__2_5_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8546,10 +7462,10 @@ define void @s_shuffle_v3i64_v3i64__5_u_5() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -8566,15 +7482,15 @@ define void @s_shuffle_v3i64_v3i64__5_0_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8584,15 +7500,15 @@ define void @s_shuffle_v3i64_v3i64__5_0_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8602,15 +7518,15 @@ define void @s_shuffle_v3i64_v3i64__5_0_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 ; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8630,12 +7546,10 @@ define void @s_shuffle_v3i64_v3i64__5_1_5() { ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8648,12 +7562,10 @@ define void @s_shuffle_v3i64_v3i64__5_1_5() { ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8665,13 +7577,12 @@ define void @s_shuffle_v3i64_v3i64__5_1_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8688,15 +7599,15 @@ define void @s_shuffle_v3i64_v3i64__5_2_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8706,15 +7617,15 @@ define void @s_shuffle_v3i64_v3i64__5_2_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8726,13 +7637,13 @@ define void @s_shuffle_v3i64_v3i64__5_2_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8745,56 +7656,20 @@ define void @s_shuffle_v3i64_v3i64__5_2_5() { } define void @s_shuffle_v3i64_v3i64__5_3_5() { -; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_3_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_3_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> @@ -8807,10 +7682,12 @@ define void @s_shuffle_v3i64_v3i64__5_4_5() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll index 6e156d2d4a2f5..58a2146c00de9 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll @@ -139,39 +139,33 @@ define void @v_shuffle_v3i64_v4i64__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -275,39 +269,33 @@ define void @v_shuffle_v3i64_v4i64__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -321,55 +309,42 @@ define void @v_shuffle_v3i64_v4i64__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -383,49 +358,43 @@ define void @v_shuffle_v3i64_v4i64__7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -439,49 +408,43 @@ define void @v_shuffle_v3i64_v4i64__7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -495,49 +458,43 @@ define void @v_shuffle_v3i64_v4i64__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -551,45 +508,40 @@ define void @v_shuffle_v3i64_v4i64__7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -603,39 +555,40 @@ define void @v_shuffle_v3i64_v4i64__7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -649,39 +602,40 @@ define void @v_shuffle_v3i64_v4i64__7_6_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_6_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -695,39 +649,40 @@ define void @v_shuffle_v3i64_v4i64__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_7_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -741,51 +696,51 @@ define void @v_shuffle_v3i64_v4i64__7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_7_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -799,51 +754,52 @@ define void @v_shuffle_v3i64_v4i64__7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_7_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -857,51 +813,52 @@ define void @v_shuffle_v3i64_v4i64__7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_7_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -915,51 +872,52 @@ define void @v_shuffle_v3i64_v4i64__7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_7_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -973,42 +931,42 @@ define void @v_shuffle_v3i64_v4i64__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1022,42 +980,42 @@ define void @v_shuffle_v3i64_v4i64__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1071,42 +1029,42 @@ define void @v_shuffle_v3i64_v4i64__7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_7_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1120,42 +1078,42 @@ define void @v_shuffle_v3i64_v4i64__7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1169,42 +1127,36 @@ define void @v_shuffle_v3i64_v4i64__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__u_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1361,48 +1313,42 @@ define void @v_shuffle_v3i64_v4i64__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1415,42 +1361,36 @@ define void @v_shuffle_v3i64_v4i64__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__4_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__4_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__4_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1579,57 +1519,45 @@ define void @v_shuffle_v3i64_v4i64__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1643,52 +1571,45 @@ define void @v_shuffle_v3i64_v4i64__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1702,52 +1623,51 @@ define void @v_shuffle_v3i64_v4i64__7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1761,51 +1681,51 @@ define void @v_shuffle_v3i64_v4i64__7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1819,51 +1739,51 @@ define void @v_shuffle_v3i64_v4i64__7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1877,52 +1797,51 @@ define void @v_shuffle_v3i64_v4i64__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1936,51 +1855,51 @@ define void @v_shuffle_v3i64_v4i64__7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1994,51 +1913,51 @@ define void @v_shuffle_v3i64_v4i64__7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_6_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2232,42 +2151,42 @@ define void @v_shuffle_v3i64_v4i64__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__3_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2439,16 +2358,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2456,16 +2373,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2473,16 +2388,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2497,16 +2410,14 @@ define void @v_shuffle_v3i64_v4i64__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2514,16 +2425,14 @@ define void @v_shuffle_v3i64_v4i64__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2531,16 +2440,14 @@ define void @v_shuffle_v3i64_v4i64__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2554,57 +2461,45 @@ define void @v_shuffle_v3i64_v4i64__7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2620,15 +2515,15 @@ define void @v_shuffle_v3i64_v4i64__7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v14, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2637,15 +2532,15 @@ define void @v_shuffle_v3i64_v4i64__7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v14, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2654,16 +2549,15 @@ define void @v_shuffle_v3i64_v4i64__7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2677,51 +2571,51 @@ define void @v_shuffle_v3i64_v4i64__7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2735,58 +2629,52 @@ define void @v_shuffle_v3i64_v4i64__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_4_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2800,51 +2688,52 @@ define void @v_shuffle_v3i64_v4i64__7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v10 -; GFX900-NEXT: v_mov_b32_e32 v5, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v10 -; GFX90A-NEXT: v_mov_b32_e32 v5, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v10 -; GFX942-NEXT: v_mov_b32_e32 v5, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2858,51 +2747,52 @@ define void @v_shuffle_v3i64_v4i64__7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v10 -; GFX900-NEXT: v_mov_b32_e32 v7, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v10 -; GFX90A-NEXT: v_mov_b32_e32 v7, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_6_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v10 -; GFX942-NEXT: v_mov_b32_e32 v7, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3096,42 +2986,42 @@ define void @v_shuffle_v3i64_v4i64__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3303,16 +3193,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3320,16 +3208,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3337,16 +3223,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3361,16 +3245,14 @@ define void @v_shuffle_v3i64_v4i64__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3378,16 +3260,14 @@ define void @v_shuffle_v3i64_v4i64__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v12 -; GFX90A-NEXT: v_mov_b32_e32 v1, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3395,16 +3275,14 @@ define void @v_shuffle_v3i64_v4i64__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v12 -; GFX942-NEXT: v_mov_b32_e32 v1, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3418,57 +3296,45 @@ define void @v_shuffle_v3i64_v4i64__7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3482,51 +3348,46 @@ define void @v_shuffle_v3i64_v4i64__7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v12 -; GFX90A-NEXT: v_mov_b32_e32 v1, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v12 -; GFX942-NEXT: v_mov_b32_e32 v1, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3540,51 +3401,51 @@ define void @v_shuffle_v3i64_v4i64__7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v16, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v16, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3598,57 +3459,52 @@ define void @v_shuffle_v3i64_v4i64__7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v12 -; GFX90A-NEXT: v_mov_b32_e32 v1, v13 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v12 -; GFX942-NEXT: v_mov_b32_e32 v1, v13 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3662,51 +3518,52 @@ define void @v_shuffle_v3i64_v4i64__7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v12 -; GFX900-NEXT: v_mov_b32_e32 v7, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v12 -; GFX90A-NEXT: v_mov_b32_e32 v7, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_5_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v12 -; GFX942-NEXT: v_mov_b32_e32 v7, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3720,51 +3577,52 @@ define void @v_shuffle_v3i64_v4i64__7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v12 -; GFX900-NEXT: v_mov_b32_e32 v9, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v12 -; GFX90A-NEXT: v_mov_b32_e32 v9, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_6_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v12 -; GFX942-NEXT: v_mov_b32_e32 v9, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3958,42 +3816,42 @@ define void @v_shuffle_v3i64_v4i64__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4165,16 +4023,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4182,16 +4038,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4199,16 +4053,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4223,16 +4075,14 @@ define void @v_shuffle_v3i64_v4i64__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4240,16 +4090,14 @@ define void @v_shuffle_v3i64_v4i64__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4257,16 +4105,14 @@ define void @v_shuffle_v3i64_v4i64__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4280,57 +4126,45 @@ define void @v_shuffle_v3i64_v4i64__7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v14 -; GFX90A-NEXT: v_mov_b32_e32 v3, v15 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v14 -; GFX942-NEXT: v_mov_b32_e32 v3, v15 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4344,51 +4178,46 @@ define void @v_shuffle_v3i64_v4i64__7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_1_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4402,51 +4231,46 @@ define void @v_shuffle_v3i64_v4i64__7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v14 -; GFX90A-NEXT: v_mov_b32_e32 v3, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v14 -; GFX942-NEXT: v_mov_b32_e32 v3, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4460,57 +4284,52 @@ define void @v_shuffle_v3i64_v4i64__7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4524,51 +4343,52 @@ define void @v_shuffle_v3i64_v4i64__7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v14 -; GFX900-NEXT: v_mov_b32_e32 v9, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v14 -; GFX90A-NEXT: v_mov_b32_e32 v9, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v14 -; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4582,51 +4402,52 @@ define void @v_shuffle_v3i64_v4i64__7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v14 -; GFX900-NEXT: v_mov_b32_e32 v11, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v14 -; GFX90A-NEXT: v_mov_b32_e32 v11, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_6_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v14 -; GFX942-NEXT: v_mov_b32_e32 v11, v15 +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4771,39 +4592,33 @@ define void @v_shuffle_v3i64_v4i64__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4925,48 +4740,42 @@ define void @v_shuffle_v3i64_v4i64__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4985,9 +4794,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4999,9 +4806,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5013,9 +4818,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5029,57 +4832,45 @@ define void @v_shuffle_v3i64_v4i64__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_0_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5094,16 +4885,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5111,16 +4900,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5128,16 +4915,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5152,16 +4937,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5169,16 +4952,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5186,16 +4967,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5210,16 +4989,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5227,16 +5004,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5244,16 +5019,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5267,42 +5040,42 @@ define void @v_shuffle_v3i64_v4i64__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5316,42 +5089,42 @@ define void @v_shuffle_v3i64_v4i64__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5586,13 +5359,13 @@ define void @v_shuffle_v3i64_v4i64__3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5603,13 +5376,13 @@ define void @v_shuffle_v3i64_v4i64__3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5620,13 +5393,13 @@ define void @v_shuffle_v3i64_v4i64__3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5781,42 +5554,42 @@ define void @v_shuffle_v3i64_v4i64__7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5830,42 +5603,36 @@ define void @v_shuffle_v3i64_v4i64__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5879,57 +5646,45 @@ define void @v_shuffle_v3i64_v4i64__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_0_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5944,16 +5699,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5961,16 +5714,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5978,16 +5729,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6002,16 +5751,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6019,16 +5766,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6036,16 +5781,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6060,16 +5803,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6077,16 +5818,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6094,16 +5833,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6117,48 +5854,42 @@ define void @v_shuffle_v3i64_v4i64__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6172,42 +5903,42 @@ define void @v_shuffle_v3i64_v4i64__7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6442,13 +6173,13 @@ define void @v_shuffle_v3i64_v4i64__3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v6 -; GFX900-NEXT: v_mov_b32_e32 v11, v7 ; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6459,13 +6190,13 @@ define void @v_shuffle_v3i64_v4i64__3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v6 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 ; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6476,13 +6207,13 @@ define void @v_shuffle_v3i64_v4i64__3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v10, v6 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6637,42 +6368,42 @@ define void @v_shuffle_v3i64_v4i64__7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_6_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6686,42 +6417,36 @@ define void @v_shuffle_v3i64_v4i64__7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_u_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6735,57 +6460,45 @@ define void @v_shuffle_v3i64_v4i64__7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_0_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6800,16 +6513,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6817,16 +6528,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6834,16 +6543,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6858,16 +6565,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6875,16 +6580,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6892,16 +6595,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6916,16 +6617,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6933,16 +6632,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6950,16 +6647,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6973,48 +6668,42 @@ define void @v_shuffle_v3i64_v4i64__7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7028,42 +6717,42 @@ define void @v_shuffle_v3i64_v4i64__7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_5_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7298,13 +6987,13 @@ define void @v_shuffle_v3i64_v4i64__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v6 -; GFX900-NEXT: v_mov_b32_e32 v13, v7 ; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7315,13 +7004,13 @@ define void @v_shuffle_v3i64_v4i64__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, v6 -; GFX90A-NEXT: v_mov_b32_e32 v13, v7 ; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7332,13 +7021,13 @@ define void @v_shuffle_v3i64_v4i64__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v12, v6 -; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7493,42 +7182,36 @@ define void @v_shuffle_v3i64_v4i64__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_u_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7542,57 +7225,45 @@ define void @v_shuffle_v3i64_v4i64__7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_0_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7607,16 +7278,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7624,16 +7293,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7641,16 +7308,14 @@ define void @v_shuffle_v3i64_v4i64__7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7665,16 +7330,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7682,16 +7345,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7699,16 +7360,14 @@ define void @v_shuffle_v3i64_v4i64__7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7723,16 +7382,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7740,16 +7397,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7757,16 +7412,14 @@ define void @v_shuffle_v3i64_v4i64__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7780,48 +7433,42 @@ define void @v_shuffle_v3i64_v4i64__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7835,42 +7482,42 @@ define void @v_shuffle_v3i64_v4i64__7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7884,42 +7531,42 @@ define void @v_shuffle_v3i64_v4i64__7_6_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_6_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8055,10 +7702,9 @@ define void @s_shuffle_v3i64_v4i64__2_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8201,10 +7847,9 @@ define void @s_shuffle_v3i64_v4i64__6_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8477,15 +8122,14 @@ define void @s_shuffle_v3i64_v4i64__7_3_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8622,134 +8266,62 @@ define void @s_shuffle_v3i64_v4i64__7_6_u() { } define void @s_shuffle_v3i64_v4i64__7_7_u() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_u: +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_7_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_7_0() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_7_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_7_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_7_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_7_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_7_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_7_1() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8759,17 +8331,15 @@ define void @s_shuffle_v3i64_v4i64__7_7_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8838,13 +8408,12 @@ define void @s_shuffle_v3i64_v4i64__7_7_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8899,15 +8468,13 @@ define void @s_shuffle_v3i64_v4i64__7_7_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8920,52 +8487,20 @@ define void @s_shuffle_v3i64_v4i64__7_7_3() { } define void @s_shuffle_v3i64_v4i64__7_7_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_7_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_7_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> @@ -8974,1103 +8509,58 @@ define void @s_shuffle_v3i64_v4i64__7_7_4() { } define void @s_shuffle_v3i64_v4i64__7_7_5() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_7_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_7_6() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_7_6: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_7_7() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_7_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__u_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__0_0_0() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_0_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s9 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s9 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> zeroinitializer - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__1_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__2_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__3_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__4_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__5_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__6_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_0_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_u_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_1_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_2_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_3_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_4_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_5_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_6_0() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_7_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__u_1_1() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_1_1: +define void @s_shuffle_v3i64_v4i64__7_7_6() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_7_6: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__0_1_1() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_1_1: +define void @s_shuffle_v3i64_v4i64__7_7_7() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_7_7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s12, s10 ; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: ;;#ASMSTART @@ -10078,473 +8568,397 @@ define void @s_shuffle_v3i64_v4i64__0_1_1() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__1_1_1() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__1_1_1: +define void @s_shuffle_v3i64_v4i64__u_0_0() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__2_1_1() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__2_1_1: +define void @s_shuffle_v3i64_v4i64__0_0_0() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> zeroinitializer call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__3_1_1() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__3_1_1: +define void @s_shuffle_v3i64_v4i64__1_0_0() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__1_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s8, s14 ; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__4_1_1() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_1_1: +define void @s_shuffle_v3i64_v4i64__2_0_0() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__2_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__5_1_1() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v3i64_v4i64__3_0_0() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__3_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__6_1_1() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v3i64_v4i64__4_0_0() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_1_1() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_1: +define void @s_shuffle_v3i64_v4i64__5_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_1: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_1: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_u_1() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_1: +define void @s_shuffle_v3i64_v4i64__6_0_0() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__6_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_1: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_1: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_0_1() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_1: +define void @s_shuffle_v3i64_v4i64__7_u_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_1: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_1: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_2_1() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_1: +define void @s_shuffle_v3i64_v4i64__7_1_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_1: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_1: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_3_1() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_1: +define void @s_shuffle_v3i64_v4i64__7_2_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -10553,16 +8967,16 @@ define void @s_shuffle_v3i64_v4i64__7_3_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_1: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -10571,63 +8985,84 @@ define void @s_shuffle_v3i64_v4i64__7_3_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_1: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_4_1() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_1: +define void @s_shuffle_v3i64_v4i64__7_3_0() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_4_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_1: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -10636,45 +9071,41 @@ define void @s_shuffle_v3i64_v4i64__7_4_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_1: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_5_1() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_1: +define void @s_shuffle_v3i64_v4i64__7_5_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -10685,14 +9116,14 @@ define void @s_shuffle_v3i64_v4i64__7_5_1() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_1: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -10703,14 +9134,14 @@ define void @s_shuffle_v3i64_v4i64__7_5_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_1: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -10721,126 +9152,120 @@ define void @s_shuffle_v3i64_v4i64__7_5_1() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_6_1() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_1: +define void @s_shuffle_v3i64_v4i64__7_6_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_1: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_1: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__u_2_2() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_2_2: +define void @s_shuffle_v3i64_v4i64__u_1_1() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__0_2_2() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_2_2: +define void @s_shuffle_v3i64_v4i64__0_1_1() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__1_2_2() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__1_2_2: +define void @s_shuffle_v3i64_v4i64__1_1_1() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__1_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART @@ -10848,41 +9273,41 @@ define void @s_shuffle_v3i64_v4i64__1_2_2() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__2_2_2() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__2_2_2: +define void @s_shuffle_v3i64_v4i64__2_1_1() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__2_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__3_2_2() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__3_2_2: +define void @s_shuffle_v3i64_v4i64__3_1_1() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__3_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART @@ -10890,75 +9315,75 @@ define void @s_shuffle_v3i64_v4i64__3_2_2() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s8, s14 ; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__4_2_2() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_2_2: +define void @s_shuffle_v3i64_v4i64__4_1_1() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__5_2_2() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_2_2: +define void @s_shuffle_v3i64_v4i64__5_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_2_2: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_2_2: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -10969,171 +9394,118 @@ define void @s_shuffle_v3i64_v4i64__5_2_2() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__6_2_2() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_2_2() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_2: +define void @s_shuffle_v3i64_v4i64__6_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_2: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_2: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s11, s3 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_u_2() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_2: +define void @s_shuffle_v3i64_v4i64__7_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_2: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_2: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -11144,177 +9516,189 @@ define void @s_shuffle_v3i64_v4i64__7_u_2() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_0_2() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_2: +define void @s_shuffle_v3i64_v4i64__7_u_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_2: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_2: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_1_2() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_2: +define void @s_shuffle_v3i64_v4i64__7_0_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_2: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_2: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_3_2() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_2: +define void @s_shuffle_v3i64_v4i64__7_2_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_2: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_2: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -11323,120 +9707,153 @@ define void @s_shuffle_v3i64_v4i64__7_3_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_4_2() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_2: +define void @s_shuffle_v3i64_v4i64__7_3_1() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_3_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_4_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_2: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_2: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_5_2() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_2: +define void @s_shuffle_v3i64_v4i64__7_5_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_2: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_2: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -11447,57 +9864,61 @@ define void @s_shuffle_v3i64_v4i64__7_5_2() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_6_2() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_2: +define void @s_shuffle_v3i64_v4i64__7_6_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_2: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_2: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -11506,357 +9927,418 @@ define void @s_shuffle_v3i64_v4i64__7_6_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__u_3_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v3i64_v4i64__u_2_2() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__0_3_3() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_3_3: +define void @s_shuffle_v3i64_v4i64__0_2_2() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_2_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__1_3_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_3_3: +define void @s_shuffle_v3i64_v4i64__1_2_2() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__1_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__2_2_2() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__3_2_2() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__4_2_2() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__5_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_3_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_3_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__2_3_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_3_3: +define void @s_shuffle_v3i64_v4i64__6_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_3_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_3_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__3_3_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_3_3: +define void @s_shuffle_v3i64_v4i64__7_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_3_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_3_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__4_3_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_3_3: +define void @s_shuffle_v3i64_v4i64__7_u_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_3_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_3_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__5_3_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_3_3: +define void @s_shuffle_v3i64_v4i64__7_0_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_3_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_3_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -11865,61 +10347,57 @@ define void @s_shuffle_v3i64_v4i64__5_3_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__6_3_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_3_3: +define void @s_shuffle_v3i64_v4i64__7_1_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_3_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_3_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -11928,25 +10406,21 @@ define void @s_shuffle_v3i64_v4i64__6_3_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_3_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_3: +define void @s_shuffle_v3i64_v4i64__7_3_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -11959,14 +10433,12 @@ define void @s_shuffle_v3i64_v4i64__7_3_3() { ; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: s_mov_b32 s10, s14 ; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -11979,77 +10451,73 @@ define void @s_shuffle_v3i64_v4i64__7_3_3() { ; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: s_mov_b32 s10, s14 ; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_u_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_3: +define void @s_shuffle_v3i64_v4i64__7_4_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -12058,63 +10526,59 @@ define void @s_shuffle_v3i64_v4i64__7_u_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_0_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_3: +define void @s_shuffle_v3i64_v4i64__7_5_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -12125,23 +10589,21 @@ define void @s_shuffle_v3i64_v4i64__7_0_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_1_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_3: +define void @s_shuffle_v3i64_v4i64__7_6_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -12152,14 +10614,14 @@ define void @s_shuffle_v3i64_v4i64__7_1_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s22 ; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -12170,14 +10632,14 @@ define void @s_shuffle_v3i64_v4i64__7_1_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s22 ; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -12188,88 +10650,202 @@ define void @s_shuffle_v3i64_v4i64__7_1_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_2_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_3: +define void @s_shuffle_v3i64_v4i64__u_3_3() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__0_3_3() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__1_3_3() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__1_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__2_3_3() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__2_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__3_3_3() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__4_3_3() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__5_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_4_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_3: +define void @s_shuffle_v3i64_v4i64__6_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -12278,10 +10854,8 @@ define void @s_shuffle_v3i64_v4i64__7_4_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART @@ -12289,7 +10863,7 @@ define void @s_shuffle_v3i64_v4i64__7_4_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -12298,10 +10872,8 @@ define void @s_shuffle_v3i64_v4i64__7_4_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: s_mov_b32 s12, s14 ; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART @@ -12309,135 +10881,129 @@ define void @s_shuffle_v3i64_v4i64__7_4_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_5_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_3: +define void @s_shuffle_v3i64_v4i64__7_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_6_3() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_3: +define void @s_shuffle_v3i64_v4i64__7_u_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_3: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_3: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -12448,8 +11014,6 @@ define void @s_shuffle_v3i64_v4i64__7_6_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s12, s6 ; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART @@ -12458,620 +11022,687 @@ define void @s_shuffle_v3i64_v4i64__7_6_3() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__u_4_4() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_4_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__0_4_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_4_4: +define void @s_shuffle_v3i64_v4i64__7_0_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_4_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_4_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__1_4_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_4_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_4_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_4_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__2_4_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_4_4: +define void @s_shuffle_v3i64_v4i64__7_1_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_4_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_4_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__3_4_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_4_4: +define void @s_shuffle_v3i64_v4i64__7_2_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_4_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_4_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__4_4_4() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_4_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__5_4_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_4_4: +define void @s_shuffle_v3i64_v4i64__7_4_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_4_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_4_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__6_4_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_4_4: +define void @s_shuffle_v3i64_v4i64__7_5_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_4_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_4_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_4_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_4: +define void @s_shuffle_v3i64_v4i64__7_6_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_u_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_4: +define void @s_shuffle_v3i64_v4i64__u_4_4() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__0_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_0_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_4: +define void @s_shuffle_v3i64_v4i64__1_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_1_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_4: +define void @s_shuffle_v3i64_v4i64__2_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_2_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_4: +define void @s_shuffle_v3i64_v4i64__3_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:19] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__4_4_4() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__5_4_4() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__5_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_3_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_4: +define void @s_shuffle_v3i64_v4i64__6_4_4() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__6_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_4_4() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_u_4() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_0_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -13082,12 +11713,14 @@ define void @s_shuffle_v3i64_v4i64__7_3_4() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -13098,12 +11731,14 @@ define void @s_shuffle_v3i64_v4i64__7_3_4() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -13114,124 +11749,196 @@ define void @s_shuffle_v3i64_v4i64__7_3_4() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s18 ; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_5_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_4: +define void @s_shuffle_v3i64_v4i64__7_1_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) ret void } -define void @s_shuffle_v3i64_v4i64__7_6_4() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_4: +define void @s_shuffle_v3i64_v4i64__7_2_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_4: +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_4: +; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_3_4() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_3_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_5_4() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_5_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_6_4() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_6_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> @@ -13386,15 +12093,15 @@ define void @s_shuffle_v3i64_v4i64__2_5_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -13404,15 +12111,15 @@ define void @s_shuffle_v3i64_v4i64__2_5_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -13422,15 +12129,16 @@ define void @s_shuffle_v3i64_v4i64__2_5_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s11, s3 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -13550,12 +12258,12 @@ define void @s_shuffle_v3i64_v4i64__6_5_5() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -13878,17 +12586,16 @@ define void @s_shuffle_v3i64_v4i64__7_3_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -14163,15 +12870,15 @@ define void @s_shuffle_v3i64_v4i64__2_6_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -14181,15 +12888,15 @@ define void @s_shuffle_v3i64_v4i64__2_6_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -14199,15 +12906,16 @@ define void @s_shuffle_v3i64_v4i64__2_6_6() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -14327,12 +13035,12 @@ define void @s_shuffle_v3i64_v4i64__6_6_6() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -14574,15 +13282,15 @@ define void @s_shuffle_v3i64_v4i64__7_3_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -14592,15 +13300,15 @@ define void @s_shuffle_v3i64_v4i64__7_3_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -14610,15 +13318,16 @@ define void @s_shuffle_v3i64_v4i64__7_3_6() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -14671,88 +13380,56 @@ define void @s_shuffle_v3i64_v4i64__7_4_6() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__7_5_6() { -; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_5_6: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) - ret void -} - -define void @s_shuffle_v3i64_v4i64__u_7_7() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_7_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_7_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_7_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_5_6() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_5_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__u_7_7() { +; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> @@ -14826,17 +13503,15 @@ define void @s_shuffle_v3i64_v4i64__1_7_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -14846,17 +13521,15 @@ define void @s_shuffle_v3i64_v4i64__1_7_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -14866,17 +13539,16 @@ define void @s_shuffle_v3i64_v4i64__1_7_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -14929,17 +13601,16 @@ define void @s_shuffle_v3i64_v4i64__2_7_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -14956,17 +13627,15 @@ define void @s_shuffle_v3i64_v4i64__3_7_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -14976,17 +13645,15 @@ define void @s_shuffle_v3i64_v4i64__3_7_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -14998,15 +13665,14 @@ define void @s_shuffle_v3i64_v4i64__3_7_7() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -15041,56 +13707,20 @@ define void @s_shuffle_v3i64_v4i64__4_7_7() { } define void @s_shuffle_v3i64_v4i64__5_7_7() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_7_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_7_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_7_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v4i64__5_7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> @@ -15099,56 +13729,18 @@ define void @s_shuffle_v3i64_v4i64__5_7_7() { } define void @s_shuffle_v3i64_v4i64__6_7_7() { -; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_7_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_7_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_7_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i64_v4i64__6_7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> @@ -15444,17 +14036,16 @@ define void @s_shuffle_v3i64_v4i64__7_3_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll index 8757639c501d2..ff8ddd031858f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll @@ -58,39 +58,33 @@ define void @v_shuffle_v3p0_v2p0__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -114,39 +108,33 @@ define void @v_shuffle_v3p0_v2p0__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -160,55 +148,42 @@ define void @v_shuffle_v3p0_v2p0__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -222,49 +197,43 @@ define void @v_shuffle_v3p0_v2p0__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -291,31 +260,27 @@ define void @v_shuffle_v3p0_v2p0__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -329,39 +294,40 @@ define void @v_shuffle_v3p0_v2p0__3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -375,51 +341,51 @@ define void @v_shuffle_v3p0_v2p0__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -433,51 +399,52 @@ define void @v_shuffle_v3p0_v2p0__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -491,42 +458,42 @@ define void @v_shuffle_v3p0_v2p0__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -540,42 +507,42 @@ define void @v_shuffle_v3p0_v2p0__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -589,42 +556,36 @@ define void @v_shuffle_v3p0_v2p0__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__u_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -699,32 +660,28 @@ define void @v_shuffle_v3p0_v2p0__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -737,42 +694,36 @@ define void @v_shuffle_v3p0_v2p0__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__2_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -785,57 +736,45 @@ define void @v_shuffle_v3p0_v2p0__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -849,52 +788,45 @@ define void @v_shuffle_v3p0_v2p0__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -910,15 +842,15 @@ define void @v_shuffle_v3p0_v2p0__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -927,15 +859,15 @@ define void @v_shuffle_v3p0_v2p0__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -944,15 +876,15 @@ define void @v_shuffle_v3p0_v2p0__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -966,52 +898,51 @@ define void @v_shuffle_v3p0_v2p0__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1109,42 +1040,42 @@ define void @v_shuffle_v3p0_v2p0__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1200,16 +1131,14 @@ define void @v_shuffle_v3p0_v2p0__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1217,16 +1146,14 @@ define void @v_shuffle_v3p0_v2p0__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1234,16 +1161,14 @@ define void @v_shuffle_v3p0_v2p0__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1258,16 +1183,14 @@ define void @v_shuffle_v3p0_v2p0__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1275,16 +1198,14 @@ define void @v_shuffle_v3p0_v2p0__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1292,16 +1213,14 @@ define void @v_shuffle_v3p0_v2p0__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1317,16 +1236,12 @@ define void @v_shuffle_v3p0_v2p0__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1336,16 +1251,12 @@ define void @v_shuffle_v3p0_v2p0__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1355,16 +1266,12 @@ define void @v_shuffle_v3p0_v2p0__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1379,57 +1286,52 @@ define void @v_shuffle_v3p0_v2p0__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1493,39 +1395,33 @@ define void @v_shuffle_v3p0_v2p0__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1563,32 +1459,28 @@ define void @v_shuffle_v3p0_v2p0__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1607,9 +1499,7 @@ define void @v_shuffle_v3p0_v2p0__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1621,9 +1511,7 @@ define void @v_shuffle_v3p0_v2p0__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1635,9 +1523,7 @@ define void @v_shuffle_v3p0_v2p0__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1651,57 +1537,45 @@ define void @v_shuffle_v3p0_v2p0__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_0_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1716,16 +1590,14 @@ define void @v_shuffle_v3p0_v2p0__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1733,16 +1605,14 @@ define void @v_shuffle_v3p0_v2p0__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1750,16 +1620,14 @@ define void @v_shuffle_v3p0_v2p0__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1878,13 +1746,13 @@ define void @v_shuffle_v3p0_v2p0__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1895,13 +1763,13 @@ define void @v_shuffle_v3p0_v2p0__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1912,13 +1780,13 @@ define void @v_shuffle_v3p0_v2p0__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1975,42 +1843,36 @@ define void @v_shuffle_v3p0_v2p0__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2024,57 +1886,45 @@ define void @v_shuffle_v3p0_v2p0__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_0_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2089,16 +1939,14 @@ define void @v_shuffle_v3p0_v2p0__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2106,16 +1954,14 @@ define void @v_shuffle_v3p0_v2p0__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2123,16 +1969,14 @@ define void @v_shuffle_v3p0_v2p0__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2160,32 +2004,28 @@ define void @v_shuffle_v3p0_v2p0__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2543,59 +2383,21 @@ define void @s_shuffle_v3p0_v2p0__3_3_u() { } define void @s_shuffle_v3p0_v2p0__3_3_0() { -; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_3_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_3_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_3_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v2p0__3_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=s"() %vec1 = call <2 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> @@ -2665,56 +2467,20 @@ define void @s_shuffle_v3p0_v2p0__3_3_1() { } define void @s_shuffle_v3p0_v2p0__3_3_2() { -; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v2p0__3_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=s"() %vec1 = call <2 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> @@ -2745,50 +2511,18 @@ define void @s_shuffle_v3p0_v2p0__3_3_3() { } define void @s_shuffle_v3p0_v2p0__u_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v2p0__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v2p0__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v2p0__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v2p0__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) @@ -2817,56 +2551,20 @@ define void @s_shuffle_v3p0_v2p0__0_0_0() { } define void @s_shuffle_v3p0_v2p0__1_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v2p0__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v2p0__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v2p0__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v2p0__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) @@ -2874,50 +2572,18 @@ define void @s_shuffle_v3p0_v2p0__1_0_0() { } define void @s_shuffle_v3p0_v2p0__2_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v2p0__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v2p0__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v2p0__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v2p0__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) @@ -2929,17 +2595,15 @@ define void @s_shuffle_v3p0_v2p0__3_0_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -2949,17 +2613,15 @@ define void @s_shuffle_v3p0_v2p0__3_0_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -2969,17 +2631,15 @@ define void @s_shuffle_v3p0_v2p0__3_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] +; GFX942-NEXT: ; def s[12:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:7] +; GFX942-NEXT: ; def s[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -2996,15 +2656,13 @@ define void @s_shuffle_v3p0_v2p0__3_u_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -3014,15 +2672,13 @@ define void @s_shuffle_v3p0_v2p0__3_u_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -3032,15 +2688,13 @@ define void @s_shuffle_v3p0_v2p0__3_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] +; GFX942-NEXT: ; def s[12:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:7] +; GFX942-NEXT: ; def s[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -3057,17 +2711,15 @@ define void @s_shuffle_v3p0_v2p0__3_1_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -3077,17 +2729,15 @@ define void @s_shuffle_v3p0_v2p0__3_1_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -3097,17 +2747,15 @@ define void @s_shuffle_v3p0_v2p0__3_1_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] +; GFX942-NEXT: ; def s[12:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:7] +; GFX942-NEXT: ; def s[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -3129,12 +2777,10 @@ define void @s_shuffle_v3p0_v2p0__3_2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -3149,12 +2795,10 @@ define void @s_shuffle_v3p0_v2p0__3_2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -3164,17 +2808,15 @@ define void @s_shuffle_v3p0_v2p0__3_2_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] +; GFX942-NEXT: ; def s[12:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:7] +; GFX942-NEXT: ; def s[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -3634,56 +3276,20 @@ define void @s_shuffle_v3p0_v2p0__2_2_2() { } define void @s_shuffle_v3p0_v2p0__3_2_2() { -; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v2p0__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=s"() %vec1 = call <2 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> @@ -3692,50 +3298,18 @@ define void @s_shuffle_v3p0_v2p0__3_2_2() { } define void @s_shuffle_v3p0_v2p0__3_u_2() { -; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v2p0__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=s"() %vec1 = call <2 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> @@ -3787,14 +3361,12 @@ define void @s_shuffle_v3p0_v2p0__3_0_2() { ; GFX942-NEXT: ; def s[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:7] +; GFX942-NEXT: ; def s[12:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -3807,59 +3379,21 @@ define void @s_shuffle_v3p0_v2p0__3_0_2() { } define void @s_shuffle_v3p0_v2p0__3_1_2() { -; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v2p0__3_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=s"() %vec1 = call <2 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll index b6f4e3091b61f..8e0c74dedb69c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll @@ -100,39 +100,33 @@ define void @v_shuffle_v3p0_v3p0__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -196,39 +190,33 @@ define void @v_shuffle_v3p0_v3p0__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -242,55 +230,42 @@ define void @v_shuffle_v3p0_v3p0__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -304,49 +279,43 @@ define void @v_shuffle_v3p0_v3p0__5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -360,49 +329,43 @@ define void @v_shuffle_v3p0_v3p0__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -416,45 +379,40 @@ define void @v_shuffle_v3p0_v3p0__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -468,39 +426,40 @@ define void @v_shuffle_v3p0_v3p0__5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -514,39 +473,40 @@ define void @v_shuffle_v3p0_v3p0__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -560,51 +520,51 @@ define void @v_shuffle_v3p0_v3p0__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -618,51 +578,52 @@ define void @v_shuffle_v3p0_v3p0__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -676,51 +637,52 @@ define void @v_shuffle_v3p0_v3p0__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_5_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -734,42 +696,42 @@ define void @v_shuffle_v3p0_v3p0__5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -783,42 +745,42 @@ define void @v_shuffle_v3p0_v3p0__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -832,42 +794,42 @@ define void @v_shuffle_v3p0_v3p0__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -881,42 +843,36 @@ define void @v_shuffle_v3p0_v3p0__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__u_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1025,48 +981,42 @@ define void @v_shuffle_v3p0_v3p0__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__2_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1079,42 +1029,36 @@ define void @v_shuffle_v3p0_v3p0__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__3_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1185,57 +1129,45 @@ define void @v_shuffle_v3p0_v3p0__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1249,52 +1181,45 @@ define void @v_shuffle_v3p0_v3p0__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1308,52 +1233,51 @@ define void @v_shuffle_v3p0_v3p0__5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1367,51 +1291,51 @@ define void @v_shuffle_v3p0_v3p0__5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1425,52 +1349,51 @@ define void @v_shuffle_v3p0_v3p0__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1484,51 +1407,51 @@ define void @v_shuffle_v3p0_v3p0__5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1674,42 +1597,42 @@ define void @v_shuffle_v3p0_v3p0__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__2_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__2_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__2_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1823,16 +1746,14 @@ define void @v_shuffle_v3p0_v3p0__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1840,16 +1761,14 @@ define void @v_shuffle_v3p0_v3p0__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1857,16 +1776,14 @@ define void @v_shuffle_v3p0_v3p0__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1881,16 +1798,14 @@ define void @v_shuffle_v3p0_v3p0__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1898,16 +1813,14 @@ define void @v_shuffle_v3p0_v3p0__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1915,16 +1828,14 @@ define void @v_shuffle_v3p0_v3p0__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1938,57 +1849,45 @@ define void @v_shuffle_v3p0_v3p0__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2002,51 +1901,51 @@ define void @v_shuffle_v3p0_v3p0__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2062,18 +1961,16 @@ define void @v_shuffle_v3p0_v3p0__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_1: @@ -2081,17 +1978,15 @@ define void @v_shuffle_v3p0_v3p0__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,18 +1995,16 @@ define void @v_shuffle_v3p0_v3p0__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2125,16 +2018,16 @@ define void @v_shuffle_v3p0_v3p0__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 ; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2142,16 +2035,16 @@ define void @v_shuffle_v3p0_v3p0__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 ; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2159,16 +2052,17 @@ define void @v_shuffle_v3p0_v3p0__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_4_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 ; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -2315,42 +2209,42 @@ define void @v_shuffle_v3p0_v3p0__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2464,16 +2358,14 @@ define void @v_shuffle_v3p0_v3p0__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2481,16 +2373,14 @@ define void @v_shuffle_v3p0_v3p0__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2498,16 +2388,14 @@ define void @v_shuffle_v3p0_v3p0__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2522,16 +2410,14 @@ define void @v_shuffle_v3p0_v3p0__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2539,16 +2425,14 @@ define void @v_shuffle_v3p0_v3p0__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2556,16 +2440,14 @@ define void @v_shuffle_v3p0_v3p0__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2581,17 +2463,13 @@ define void @v_shuffle_v3p0_v3p0__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2600,17 +2478,13 @@ define void @v_shuffle_v3p0_v3p0__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2619,17 +2493,13 @@ define void @v_shuffle_v3p0_v3p0__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2643,51 +2513,46 @@ define void @v_shuffle_v3p0_v3p0__5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2701,57 +2566,52 @@ define void @v_shuffle_v3p0_v3p0__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2765,51 +2625,52 @@ define void @v_shuffle_v3p0_v3p0__5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v10 -; GFX900-NEXT: v_mov_b32_e32 v7, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v10 -; GFX90A-NEXT: v_mov_b32_e32 v7, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v10 -; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2915,39 +2776,33 @@ define void @v_shuffle_v3p0_v3p0__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3020,48 +2875,42 @@ define void @v_shuffle_v3p0_v3p0__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3080,9 +2929,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3094,9 +2941,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3108,9 +2953,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3124,57 +2967,45 @@ define void @v_shuffle_v3p0_v3p0__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_0_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3189,16 +3020,14 @@ define void @v_shuffle_v3p0_v3p0__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3206,16 +3035,14 @@ define void @v_shuffle_v3p0_v3p0__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3223,16 +3050,14 @@ define void @v_shuffle_v3p0_v3p0__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3247,16 +3072,14 @@ define void @v_shuffle_v3p0_v3p0__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3264,16 +3087,14 @@ define void @v_shuffle_v3p0_v3p0__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3281,16 +3102,14 @@ define void @v_shuffle_v3p0_v3p0__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3304,42 +3123,42 @@ define void @v_shuffle_v3p0_v3p0__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3516,13 +3335,13 @@ define void @v_shuffle_v3p0_v3p0__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3533,13 +3352,13 @@ define void @v_shuffle_v3p0_v3p0__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3550,13 +3369,13 @@ define void @v_shuffle_v3p0_v3p0__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3662,42 +3481,42 @@ define void @v_shuffle_v3p0_v3p0__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3711,42 +3530,36 @@ define void @v_shuffle_v3p0_v3p0__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 ; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3760,57 +3573,45 @@ define void @v_shuffle_v3p0_v3p0__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_0_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_0_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3825,16 +3626,14 @@ define void @v_shuffle_v3p0_v3p0__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3842,16 +3641,14 @@ define void @v_shuffle_v3p0_v3p0__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3859,16 +3656,14 @@ define void @v_shuffle_v3p0_v3p0__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3883,16 +3678,14 @@ define void @v_shuffle_v3p0_v3p0__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3900,16 +3693,14 @@ define void @v_shuffle_v3p0_v3p0__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3917,16 +3708,14 @@ define void @v_shuffle_v3p0_v3p0__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3940,48 +3729,42 @@ define void @v_shuffle_v3p0_v3p0__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4158,13 +3941,13 @@ define void @v_shuffle_v3p0_v3p0__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 ; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4175,13 +3958,13 @@ define void @v_shuffle_v3p0_v3p0__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 ; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4192,13 +3975,13 @@ define void @v_shuffle_v3p0_v3p0__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4304,42 +4087,36 @@ define void @v_shuffle_v3p0_v3p0__5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 ; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4353,57 +4130,45 @@ define void @v_shuffle_v3p0_v3p0__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_0_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_0_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4418,16 +4183,14 @@ define void @v_shuffle_v3p0_v3p0__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4435,16 +4198,14 @@ define void @v_shuffle_v3p0_v3p0__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4452,16 +4213,14 @@ define void @v_shuffle_v3p0_v3p0__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4476,16 +4235,14 @@ define void @v_shuffle_v3p0_v3p0__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4493,16 +4250,14 @@ define void @v_shuffle_v3p0_v3p0__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4510,16 +4265,14 @@ define void @v_shuffle_v3p0_v3p0__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4533,48 +4286,42 @@ define void @v_shuffle_v3p0_v3p0__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4588,42 +4335,42 @@ define void @v_shuffle_v3p0_v3p0__5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4759,10 +4506,9 @@ define void @s_shuffle_v3p0_v3p0__2_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -4860,10 +4606,9 @@ define void @s_shuffle_v3p0_v3p0__5_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -4880,15 +4625,13 @@ define void @s_shuffle_v3p0_v3p0__5_0_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -4898,15 +4641,13 @@ define void @s_shuffle_v3p0_v3p0__5_0_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -4969,11 +4710,11 @@ define void @s_shuffle_v3p0_v3p0__5_1_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -5024,13 +4765,11 @@ define void @s_shuffle_v3p0_v3p0__5_2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -5043,46 +4782,18 @@ define void @s_shuffle_v3p0_v3p0__5_2_u() { } define void @s_shuffle_v3p0_v3p0__5_3_u() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_3_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -5095,10 +4806,10 @@ define void @s_shuffle_v3p0_v3p0__5_4_u() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -5111,50 +4822,18 @@ define void @s_shuffle_v3p0_v3p0__5_4_u() { } define void @s_shuffle_v3p0_v3p0__5_5_u() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_5_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_5_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -5163,65 +4842,21 @@ define void @s_shuffle_v3p0_v3p0__5_5_u() { } define void @s_shuffle_v3p0_v3p0__5_5_0() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_5_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_5_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -5234,17 +4869,15 @@ define void @s_shuffle_v3p0_v3p0__5_5_1() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -5254,17 +4887,15 @@ define void @s_shuffle_v3p0_v3p0__5_5_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -5274,16 +4905,14 @@ define void @s_shuffle_v3p0_v3p0__5_5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -5304,12 +4933,10 @@ define void @s_shuffle_v3p0_v3p0__5_5_2() { ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -5322,12 +4949,10 @@ define void @s_shuffle_v3p0_v3p0__5_5_2() { ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -5339,13 +4964,12 @@ define void @s_shuffle_v3p0_v3p0__5_5_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -5358,52 +4982,20 @@ define void @s_shuffle_v3p0_v3p0__5_5_2() { } define void @s_shuffle_v3p0_v3p0__5_5_3() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_5_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_5_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -5412,74 +5004,38 @@ define void @s_shuffle_v3p0_v3p0__5_5_3() { } define void @s_shuffle_v3p0_v3p0__5_5_4() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_5_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_5_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + define void @s_shuffle_v3p0_v3p0__5_5_5() { ; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_5_5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -5492,50 +5048,18 @@ define void @s_shuffle_v3p0_v3p0__5_5_5() { } define void @s_shuffle_v3p0_v3p0__u_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) @@ -5564,56 +5088,20 @@ define void @s_shuffle_v3p0_v3p0__0_0_0() { } define void @s_shuffle_v3p0_v3p0__1_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) @@ -5621,52 +5109,20 @@ define void @s_shuffle_v3p0_v3p0__1_0_0() { } define void @s_shuffle_v3p0_v3p0__2_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) @@ -5674,50 +5130,18 @@ define void @s_shuffle_v3p0_v3p0__2_0_0() { } define void @s_shuffle_v3p0_v3p0__3_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__3_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__3_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) @@ -5732,14 +5156,12 @@ define void @s_shuffle_v3p0_v3p0__4_0_0() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -5752,14 +5174,12 @@ define void @s_shuffle_v3p0_v3p0__4_0_0() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -5769,17 +5189,15 @@ define void @s_shuffle_v3p0_v3p0__4_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -5792,63 +5210,21 @@ define void @s_shuffle_v3p0_v3p0__4_0_0() { } define void @s_shuffle_v3p0_v3p0__5_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -5857,307 +5233,140 @@ define void @s_shuffle_v3p0_v3p0__5_0_0() { } define void @s_shuffle_v3p0_v3p0__5_u_0() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_u_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v3p0__5_1_0() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_1_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_1_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_1_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v3p0__5_2_0() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_2_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_2_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_2_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v3p0__5_3_0() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_3_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v3p0__5_4_0() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_0: +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_0: +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_4_0: +; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_1_0() { +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_1_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_2_0() { +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_3_0() { +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_4_0() { +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_4_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -6229,12 +5438,12 @@ define void @s_shuffle_v3p0_v3p0__2_1_1() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -6364,12 +5573,10 @@ define void @s_shuffle_v3p0_v3p0__5_1_1() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s10 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 ; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -6387,15 +5594,13 @@ define void @s_shuffle_v3p0_v3p0__5_u_1() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -6405,15 +5610,13 @@ define void @s_shuffle_v3p0_v3p0__5_u_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -6446,17 +5649,15 @@ define void @s_shuffle_v3p0_v3p0__5_0_1() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -6466,17 +5667,15 @@ define void @s_shuffle_v3p0_v3p0__5_0_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -6507,61 +5706,23 @@ define void @s_shuffle_v3p0_v3p0__5_0_1() { } define void @s_shuffle_v3p0_v3p0__5_2_1() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_2_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_2_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_2_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -6574,17 +5735,15 @@ define void @s_shuffle_v3p0_v3p0__5_3_1() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -6594,17 +5753,15 @@ define void @s_shuffle_v3p0_v3p0__5_3_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -6639,15 +5796,15 @@ define void @s_shuffle_v3p0_v3p0__5_4_1() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -6657,15 +5814,15 @@ define void @s_shuffle_v3p0_v3p0__5_4_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -6675,14 +5832,14 @@ define void @s_shuffle_v3p0_v3p0__5_4_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 ; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -6759,12 +5916,12 @@ define void @s_shuffle_v3p0_v3p0__2_2_2() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -6894,12 +6051,10 @@ define void @s_shuffle_v3p0_v3p0__5_2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -6947,11 +6102,11 @@ define void @s_shuffle_v3p0_v3p0__5_u_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -6964,61 +6119,23 @@ define void @s_shuffle_v3p0_v3p0__5_u_2() { } define void @s_shuffle_v3p0_v3p0__5_0_2() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_0_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_0_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_0_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_0_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s16 +; GFX9-NEXT: s_mov_b32 s13, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -7061,11 +6178,11 @@ define void @s_shuffle_v3p0_v3p0__5_1_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -7116,13 +6233,12 @@ define void @s_shuffle_v3p0_v3p0__5_3_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -7139,15 +6255,13 @@ define void @s_shuffle_v3p0_v3p0__5_4_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -7157,15 +6271,13 @@ define void @s_shuffle_v3p0_v3p0__5_4_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -7177,13 +6289,12 @@ define void @s_shuffle_v3p0_v3p0__5_4_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -7256,252 +6367,155 @@ define void @s_shuffle_v3p0_v3p0__1_3_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__1_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v3p0__2_3_3() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__2_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v3p0__3_3_3() { -; GFX9-LABEL: s_shuffle_v3p0_v3p0__3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v3p0__4_3_3() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__4_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v3p0__5_3_3() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_3: +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_3_3: +; GFX942-LABEL: s_shuffle_v3p0_v3p0__1_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v3p0__5_u_3() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_3: +define void @s_shuffle_v3p0_v3p0__2_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_3: +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_u_3: +; GFX942-LABEL: s_shuffle_v3p0_v3p0__2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__3_3_3() { +; GFX9-LABEL: s_shuffle_v3p0_v3p0__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__4_3_3() { +; GFX9-LABEL: s_shuffle_v3p0_v3p0__4_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_3_3() { +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_u_3() { +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -7514,15 +6528,15 @@ define void @s_shuffle_v3p0_v3p0__5_0_3() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -7532,15 +6546,15 @@ define void @s_shuffle_v3p0_v3p0__5_0_3() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -7609,13 +6623,12 @@ define void @s_shuffle_v3p0_v3p0__5_1_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -7668,15 +6681,15 @@ define void @s_shuffle_v3p0_v3p0__5_2_3() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -7689,52 +6702,20 @@ define void @s_shuffle_v3p0_v3p0__5_2_3() { } define void @s_shuffle_v3p0_v3p0__5_4_3() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_4_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_4_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -7892,12 +6873,12 @@ define void @s_shuffle_v3p0_v3p0__2_4_4() { ; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -7910,12 +6891,12 @@ define void @s_shuffle_v3p0_v3p0__2_4_4() { ; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -7925,15 +6906,16 @@ define void @s_shuffle_v3p0_v3p0__2_4_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s11, s3 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -7986,70 +6968,42 @@ define void @s_shuffle_v3p0_v3p0__4_4_4() { call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } - -define void @s_shuffle_v3p0_v3p0__5_4_4() { -; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_4_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v3p0__5_u_4() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_u_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] + +define void @s_shuffle_v3p0_v3p0__5_4_4() { +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_u_4() { +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -8062,17 +7016,15 @@ define void @s_shuffle_v3p0_v3p0__5_0_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8082,17 +7034,15 @@ define void @s_shuffle_v3p0_v3p0__5_0_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8161,13 +7111,12 @@ define void @s_shuffle_v3p0_v3p0__5_1_4() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8220,17 +7169,15 @@ define void @s_shuffle_v3p0_v3p0__5_2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8243,52 +7190,20 @@ define void @s_shuffle_v3p0_v3p0__5_2_4() { } define void @s_shuffle_v3p0_v3p0__5_3_4() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_3_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_3_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -8446,12 +7361,12 @@ define void @s_shuffle_v3p0_v3p0__2_5_5() { ; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8464,12 +7379,12 @@ define void @s_shuffle_v3p0_v3p0__2_5_5() { ; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8479,15 +7394,16 @@ define void @s_shuffle_v3p0_v3p0__2_5_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8546,10 +7462,10 @@ define void @s_shuffle_v3p0_v3p0__5_u_5() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -8566,15 +7482,15 @@ define void @s_shuffle_v3p0_v3p0__5_0_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8584,15 +7500,15 @@ define void @s_shuffle_v3p0_v3p0__5_0_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8602,15 +7518,15 @@ define void @s_shuffle_v3p0_v3p0__5_0_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 ; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8630,12 +7546,10 @@ define void @s_shuffle_v3p0_v3p0__5_1_5() { ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8648,12 +7562,10 @@ define void @s_shuffle_v3p0_v3p0__5_1_5() { ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8665,13 +7577,12 @@ define void @s_shuffle_v3p0_v3p0__5_1_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8688,15 +7599,15 @@ define void @s_shuffle_v3p0_v3p0__5_2_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8706,15 +7617,15 @@ define void @s_shuffle_v3p0_v3p0__5_2_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8726,13 +7637,13 @@ define void @s_shuffle_v3p0_v3p0__5_2_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8745,56 +7656,20 @@ define void @s_shuffle_v3p0_v3p0__5_2_5() { } define void @s_shuffle_v3p0_v3p0__5_3_5() { -; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_3_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_3_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> @@ -8807,10 +7682,12 @@ define void @s_shuffle_v3p0_v3p0__5_4_5() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll index b03066e66cf66..b60d7f80b9cc2 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll @@ -139,39 +139,33 @@ define void @v_shuffle_v3p0_v4p0__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -275,39 +269,33 @@ define void @v_shuffle_v3p0_v4p0__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -321,55 +309,42 @@ define void @v_shuffle_v3p0_v4p0__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -383,49 +358,43 @@ define void @v_shuffle_v3p0_v4p0__7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -439,49 +408,43 @@ define void @v_shuffle_v3p0_v4p0__7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -495,49 +458,43 @@ define void @v_shuffle_v3p0_v4p0__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -551,45 +508,40 @@ define void @v_shuffle_v3p0_v4p0__7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -603,39 +555,40 @@ define void @v_shuffle_v3p0_v4p0__7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -649,39 +602,40 @@ define void @v_shuffle_v3p0_v4p0__7_6_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_6_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -695,39 +649,40 @@ define void @v_shuffle_v3p0_v4p0__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_7_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -741,51 +696,51 @@ define void @v_shuffle_v3p0_v4p0__7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_7_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -799,51 +754,52 @@ define void @v_shuffle_v3p0_v4p0__7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_7_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -857,51 +813,52 @@ define void @v_shuffle_v3p0_v4p0__7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_7_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -915,51 +872,52 @@ define void @v_shuffle_v3p0_v4p0__7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_7_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -973,42 +931,42 @@ define void @v_shuffle_v3p0_v4p0__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1022,42 +980,42 @@ define void @v_shuffle_v3p0_v4p0__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1071,42 +1029,42 @@ define void @v_shuffle_v3p0_v4p0__7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_7_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1120,42 +1078,42 @@ define void @v_shuffle_v3p0_v4p0__7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1169,42 +1127,36 @@ define void @v_shuffle_v3p0_v4p0__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__u_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1361,48 +1313,42 @@ define void @v_shuffle_v3p0_v4p0__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1415,42 +1361,36 @@ define void @v_shuffle_v3p0_v4p0__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__4_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__4_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__4_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1579,57 +1519,45 @@ define void @v_shuffle_v3p0_v4p0__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1643,52 +1571,45 @@ define void @v_shuffle_v3p0_v4p0__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1702,52 +1623,51 @@ define void @v_shuffle_v3p0_v4p0__7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1761,51 +1681,51 @@ define void @v_shuffle_v3p0_v4p0__7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1819,51 +1739,51 @@ define void @v_shuffle_v3p0_v4p0__7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1877,52 +1797,51 @@ define void @v_shuffle_v3p0_v4p0__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1936,51 +1855,51 @@ define void @v_shuffle_v3p0_v4p0__7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1994,51 +1913,51 @@ define void @v_shuffle_v3p0_v4p0__7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_6_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2232,42 +2151,42 @@ define void @v_shuffle_v3p0_v4p0__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__3_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2439,16 +2358,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2456,16 +2373,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2473,16 +2388,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2497,16 +2410,14 @@ define void @v_shuffle_v3p0_v4p0__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2514,16 +2425,14 @@ define void @v_shuffle_v3p0_v4p0__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2531,16 +2440,14 @@ define void @v_shuffle_v3p0_v4p0__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2554,57 +2461,45 @@ define void @v_shuffle_v3p0_v4p0__7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2620,15 +2515,15 @@ define void @v_shuffle_v3p0_v4p0__7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v14, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2637,15 +2532,15 @@ define void @v_shuffle_v3p0_v4p0__7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v14, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2654,16 +2549,15 @@ define void @v_shuffle_v3p0_v4p0__7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2677,51 +2571,51 @@ define void @v_shuffle_v3p0_v4p0__7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2735,58 +2629,52 @@ define void @v_shuffle_v3p0_v4p0__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_4_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2800,51 +2688,52 @@ define void @v_shuffle_v3p0_v4p0__7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v10 -; GFX900-NEXT: v_mov_b32_e32 v5, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v10 -; GFX90A-NEXT: v_mov_b32_e32 v5, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v10 -; GFX942-NEXT: v_mov_b32_e32 v5, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2858,51 +2747,52 @@ define void @v_shuffle_v3p0_v4p0__7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v10 -; GFX900-NEXT: v_mov_b32_e32 v7, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v10 -; GFX90A-NEXT: v_mov_b32_e32 v7, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_6_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v10 -; GFX942-NEXT: v_mov_b32_e32 v7, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3096,42 +2986,42 @@ define void @v_shuffle_v3p0_v4p0__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3303,16 +3193,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3320,16 +3208,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3337,16 +3223,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3361,16 +3245,14 @@ define void @v_shuffle_v3p0_v4p0__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3378,16 +3260,14 @@ define void @v_shuffle_v3p0_v4p0__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v12 -; GFX90A-NEXT: v_mov_b32_e32 v1, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3395,16 +3275,14 @@ define void @v_shuffle_v3p0_v4p0__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v12 -; GFX942-NEXT: v_mov_b32_e32 v1, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3418,57 +3296,45 @@ define void @v_shuffle_v3p0_v4p0__7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3482,51 +3348,46 @@ define void @v_shuffle_v3p0_v4p0__7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v12 -; GFX90A-NEXT: v_mov_b32_e32 v1, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v12 -; GFX942-NEXT: v_mov_b32_e32 v1, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3540,51 +3401,51 @@ define void @v_shuffle_v3p0_v4p0__7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v16, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v16, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3598,57 +3459,52 @@ define void @v_shuffle_v3p0_v4p0__7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v12 -; GFX90A-NEXT: v_mov_b32_e32 v1, v13 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v12 -; GFX942-NEXT: v_mov_b32_e32 v1, v13 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3662,51 +3518,52 @@ define void @v_shuffle_v3p0_v4p0__7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v12 -; GFX900-NEXT: v_mov_b32_e32 v7, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v12 -; GFX90A-NEXT: v_mov_b32_e32 v7, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_5_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v12 -; GFX942-NEXT: v_mov_b32_e32 v7, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3720,51 +3577,52 @@ define void @v_shuffle_v3p0_v4p0__7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v12 -; GFX900-NEXT: v_mov_b32_e32 v9, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v12 -; GFX90A-NEXT: v_mov_b32_e32 v9, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_6_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v12 -; GFX942-NEXT: v_mov_b32_e32 v9, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3958,42 +3816,42 @@ define void @v_shuffle_v3p0_v4p0__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4165,16 +4023,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4182,16 +4038,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4199,16 +4053,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4223,16 +4075,14 @@ define void @v_shuffle_v3p0_v4p0__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4240,16 +4090,14 @@ define void @v_shuffle_v3p0_v4p0__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4257,16 +4105,14 @@ define void @v_shuffle_v3p0_v4p0__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4280,57 +4126,45 @@ define void @v_shuffle_v3p0_v4p0__7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v14 -; GFX90A-NEXT: v_mov_b32_e32 v3, v15 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v14 -; GFX942-NEXT: v_mov_b32_e32 v3, v15 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4344,51 +4178,46 @@ define void @v_shuffle_v3p0_v4p0__7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_1_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4402,51 +4231,46 @@ define void @v_shuffle_v3p0_v4p0__7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v14 -; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v14 -; GFX90A-NEXT: v_mov_b32_e32 v3, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v14 -; GFX942-NEXT: v_mov_b32_e32 v3, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4460,57 +4284,52 @@ define void @v_shuffle_v3p0_v4p0__7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4524,51 +4343,52 @@ define void @v_shuffle_v3p0_v4p0__7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v14 -; GFX900-NEXT: v_mov_b32_e32 v9, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v14 -; GFX90A-NEXT: v_mov_b32_e32 v9, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v14 -; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4582,51 +4402,52 @@ define void @v_shuffle_v3p0_v4p0__7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v14 -; GFX900-NEXT: v_mov_b32_e32 v11, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v14 -; GFX90A-NEXT: v_mov_b32_e32 v11, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_6_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v14 -; GFX942-NEXT: v_mov_b32_e32 v11, v15 +; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4771,39 +4592,33 @@ define void @v_shuffle_v3p0_v4p0__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4925,48 +4740,42 @@ define void @v_shuffle_v3p0_v4p0__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4985,9 +4794,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4999,9 +4806,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5013,9 +4818,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5029,57 +4832,45 @@ define void @v_shuffle_v3p0_v4p0__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_0_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5094,16 +4885,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5111,16 +4900,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5128,16 +4915,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5152,16 +4937,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5169,16 +4952,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5186,16 +4967,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5210,16 +4989,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5227,16 +5004,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5244,16 +5019,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5267,42 +5040,42 @@ define void @v_shuffle_v3p0_v4p0__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5316,42 +5089,42 @@ define void @v_shuffle_v3p0_v4p0__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5586,13 +5359,13 @@ define void @v_shuffle_v3p0_v4p0__3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5603,13 +5376,13 @@ define void @v_shuffle_v3p0_v4p0__3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5620,13 +5393,13 @@ define void @v_shuffle_v3p0_v4p0__3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5781,42 +5554,42 @@ define void @v_shuffle_v3p0_v4p0__7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5830,42 +5603,36 @@ define void @v_shuffle_v3p0_v4p0__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5879,57 +5646,45 @@ define void @v_shuffle_v3p0_v4p0__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_0_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5944,16 +5699,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5961,16 +5714,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5978,16 +5729,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6002,16 +5751,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6019,16 +5766,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6036,16 +5781,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6060,16 +5803,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6077,16 +5818,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6094,16 +5833,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6117,48 +5854,42 @@ define void @v_shuffle_v3p0_v4p0__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6172,42 +5903,42 @@ define void @v_shuffle_v3p0_v4p0__7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6442,13 +6173,13 @@ define void @v_shuffle_v3p0_v4p0__3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v6 -; GFX900-NEXT: v_mov_b32_e32 v11, v7 ; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6459,13 +6190,13 @@ define void @v_shuffle_v3p0_v4p0__3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v6 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 ; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6476,13 +6207,13 @@ define void @v_shuffle_v3p0_v4p0__3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v10, v6 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6637,42 +6368,42 @@ define void @v_shuffle_v3p0_v4p0__7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_6_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6686,42 +6417,36 @@ define void @v_shuffle_v3p0_v4p0__7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_u_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6735,57 +6460,45 @@ define void @v_shuffle_v3p0_v4p0__7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_0_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6800,16 +6513,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6817,16 +6528,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6834,16 +6543,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6858,16 +6565,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6875,16 +6580,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6892,16 +6595,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6916,16 +6617,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6933,16 +6632,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6950,16 +6647,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6973,48 +6668,42 @@ define void @v_shuffle_v3p0_v4p0__7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7028,42 +6717,42 @@ define void @v_shuffle_v3p0_v4p0__7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_5_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7298,13 +6987,13 @@ define void @v_shuffle_v3p0_v4p0__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v6 -; GFX900-NEXT: v_mov_b32_e32 v13, v7 ; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7315,13 +7004,13 @@ define void @v_shuffle_v3p0_v4p0__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, v6 -; GFX90A-NEXT: v_mov_b32_e32 v13, v7 ; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7332,13 +7021,13 @@ define void @v_shuffle_v3p0_v4p0__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v12, v6 -; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7493,42 +7182,36 @@ define void @v_shuffle_v3p0_v4p0__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_u_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7542,57 +7225,45 @@ define void @v_shuffle_v3p0_v4p0__7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_0_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7607,16 +7278,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7624,16 +7293,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7641,16 +7308,14 @@ define void @v_shuffle_v3p0_v4p0__7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v14, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7665,16 +7330,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7682,16 +7345,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7699,16 +7360,14 @@ define void @v_shuffle_v3p0_v4p0__7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7723,16 +7382,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7740,16 +7397,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7757,16 +7412,14 @@ define void @v_shuffle_v3p0_v4p0__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7780,48 +7433,42 @@ define void @v_shuffle_v3p0_v4p0__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7835,42 +7482,42 @@ define void @v_shuffle_v3p0_v4p0__7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7884,42 +7531,42 @@ define void @v_shuffle_v3p0_v4p0__7_6_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_6_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8055,10 +7702,9 @@ define void @s_shuffle_v3p0_v4p0__2_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8201,10 +7847,9 @@ define void @s_shuffle_v3p0_v4p0__6_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8477,15 +8122,14 @@ define void @s_shuffle_v3p0_v4p0__7_3_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8622,134 +8266,62 @@ define void @s_shuffle_v3p0_v4p0__7_6_u() { } define void @s_shuffle_v3p0_v4p0__7_7_u() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_u: +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_7_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_7_0() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_7_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_7_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_7_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_7_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_7_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_7_1() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8759,17 +8331,15 @@ define void @s_shuffle_v3p0_v4p0__7_7_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8838,13 +8408,12 @@ define void @s_shuffle_v3p0_v4p0__7_7_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8899,15 +8468,13 @@ define void @s_shuffle_v3p0_v4p0__7_7_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -8920,52 +8487,20 @@ define void @s_shuffle_v3p0_v4p0__7_7_3() { } define void @s_shuffle_v3p0_v4p0__7_7_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_7_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_7_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> @@ -8974,1103 +8509,58 @@ define void @s_shuffle_v3p0_v4p0__7_7_4() { } define void @s_shuffle_v3p0_v4p0__7_7_5() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_7_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_7_6() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_7_6: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_7_7() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_7_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__u_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__0_0_0() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_0_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s9 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s9 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> zeroinitializer - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__1_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__2_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__3_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__4_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__5_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__6_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_0_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_u_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_1_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_2_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_3_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_4_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_5_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_6_0() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_7_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__u_1_1() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_1_1: +define void @s_shuffle_v3p0_v4p0__7_7_6() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_7_6: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__0_1_1() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_1_1: +define void @s_shuffle_v3p0_v4p0__7_7_7() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_7_7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s12, s10 ; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: ;;#ASMSTART @@ -10078,473 +8568,397 @@ define void @s_shuffle_v3p0_v4p0__0_1_1() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__1_1_1() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__1_1_1: +define void @s_shuffle_v3p0_v4p0__u_0_0() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__2_1_1() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__2_1_1: +define void @s_shuffle_v3p0_v4p0__0_0_0() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> zeroinitializer call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__3_1_1() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__3_1_1: +define void @s_shuffle_v3p0_v4p0__1_0_0() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__1_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s8, s14 ; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__4_1_1() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_1_1: +define void @s_shuffle_v3p0_v4p0__2_0_0() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__2_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__5_1_1() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v3p0_v4p0__3_0_0() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__3_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__6_1_1() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v3p0_v4p0__4_0_0() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_1_1() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_1: +define void @s_shuffle_v3p0_v4p0__5_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_1: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_1: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_u_1() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_1: +define void @s_shuffle_v3p0_v4p0__6_0_0() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__6_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_1: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_1: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_0_1() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_1: +define void @s_shuffle_v3p0_v4p0__7_u_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_1: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_1: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_2_1() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_1: +define void @s_shuffle_v3p0_v4p0__7_1_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_1: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_1: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_3_1() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_1: +define void @s_shuffle_v3p0_v4p0__7_2_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -10553,16 +8967,16 @@ define void @s_shuffle_v3p0_v4p0__7_3_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_1: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -10571,63 +8985,84 @@ define void @s_shuffle_v3p0_v4p0__7_3_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_1: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_4_1() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_1: +define void @s_shuffle_v3p0_v4p0__7_3_0() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_4_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_1: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -10636,45 +9071,41 @@ define void @s_shuffle_v3p0_v4p0__7_4_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_1: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_5_1() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_1: +define void @s_shuffle_v3p0_v4p0__7_5_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -10685,14 +9116,14 @@ define void @s_shuffle_v3p0_v4p0__7_5_1() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_1: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -10703,14 +9134,14 @@ define void @s_shuffle_v3p0_v4p0__7_5_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_1: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -10721,126 +9152,120 @@ define void @s_shuffle_v3p0_v4p0__7_5_1() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_6_1() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_1: +define void @s_shuffle_v3p0_v4p0__7_6_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_1: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_1: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__u_2_2() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_2_2: +define void @s_shuffle_v3p0_v4p0__u_1_1() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__0_2_2() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_2_2: +define void @s_shuffle_v3p0_v4p0__0_1_1() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__1_2_2() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__1_2_2: +define void @s_shuffle_v3p0_v4p0__1_1_1() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__1_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART @@ -10848,41 +9273,41 @@ define void @s_shuffle_v3p0_v4p0__1_2_2() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__2_2_2() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__2_2_2: +define void @s_shuffle_v3p0_v4p0__2_1_1() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__2_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__3_2_2() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__3_2_2: +define void @s_shuffle_v3p0_v4p0__3_1_1() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__3_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART @@ -10890,75 +9315,75 @@ define void @s_shuffle_v3p0_v4p0__3_2_2() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s8, s14 ; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__4_2_2() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_2_2: +define void @s_shuffle_v3p0_v4p0__4_1_1() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__5_2_2() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_2_2: +define void @s_shuffle_v3p0_v4p0__5_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_2_2: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_2_2: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -10969,171 +9394,118 @@ define void @s_shuffle_v3p0_v4p0__5_2_2() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__6_2_2() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_2_2() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_2: +define void @s_shuffle_v3p0_v4p0__6_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_2: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_2: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s11, s3 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_u_2() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_2: +define void @s_shuffle_v3p0_v4p0__7_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_2: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_2: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -11144,177 +9516,189 @@ define void @s_shuffle_v3p0_v4p0__7_u_2() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_0_2() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_2: +define void @s_shuffle_v3p0_v4p0__7_u_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_2: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_2: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_1_2() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_2: +define void @s_shuffle_v3p0_v4p0__7_0_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_2: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_2: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_3_2() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_2: +define void @s_shuffle_v3p0_v4p0__7_2_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_2: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_2: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -11323,120 +9707,153 @@ define void @s_shuffle_v3p0_v4p0__7_3_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_4_2() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_2: +define void @s_shuffle_v3p0_v4p0__7_3_1() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_3_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_4_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_2: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_2: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_5_2() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_2: +define void @s_shuffle_v3p0_v4p0__7_5_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_2: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_2: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -11447,57 +9864,61 @@ define void @s_shuffle_v3p0_v4p0__7_5_2() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_6_2() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_2: +define void @s_shuffle_v3p0_v4p0__7_6_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_2: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_2: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -11506,357 +9927,418 @@ define void @s_shuffle_v3p0_v4p0__7_6_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__u_3_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v3p0_v4p0__u_2_2() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__0_3_3() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_3_3: +define void @s_shuffle_v3p0_v4p0__0_2_2() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_2_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__1_3_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_3_3: +define void @s_shuffle_v3p0_v4p0__1_2_2() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__1_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__2_2_2() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__3_2_2() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__4_2_2() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__5_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_3_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_3_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__2_3_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_3_3: +define void @s_shuffle_v3p0_v4p0__6_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_3_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_3_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__3_3_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_3_3: +define void @s_shuffle_v3p0_v4p0__7_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_3_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_3_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__4_3_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_3_3: +define void @s_shuffle_v3p0_v4p0__7_u_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_3_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_3_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__5_3_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_3_3: +define void @s_shuffle_v3p0_v4p0__7_0_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_3_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_3_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -11865,61 +10347,57 @@ define void @s_shuffle_v3p0_v4p0__5_3_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__6_3_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_3_3: +define void @s_shuffle_v3p0_v4p0__7_1_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_3_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_3_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -11928,25 +10406,21 @@ define void @s_shuffle_v3p0_v4p0__6_3_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_3_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_3: +define void @s_shuffle_v3p0_v4p0__7_3_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -11959,14 +10433,12 @@ define void @s_shuffle_v3p0_v4p0__7_3_3() { ; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: s_mov_b32 s10, s14 ; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -11979,77 +10451,73 @@ define void @s_shuffle_v3p0_v4p0__7_3_3() { ; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: s_mov_b32 s10, s14 ; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_u_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_3: +define void @s_shuffle_v3p0_v4p0__7_4_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -12058,63 +10526,59 @@ define void @s_shuffle_v3p0_v4p0__7_u_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_0_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_3: +define void @s_shuffle_v3p0_v4p0__7_5_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -12125,23 +10589,21 @@ define void @s_shuffle_v3p0_v4p0__7_0_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_1_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_3: +define void @s_shuffle_v3p0_v4p0__7_6_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -12152,14 +10614,14 @@ define void @s_shuffle_v3p0_v4p0__7_1_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s22 ; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -12170,14 +10632,14 @@ define void @s_shuffle_v3p0_v4p0__7_1_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s22 ; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -12188,88 +10650,202 @@ define void @s_shuffle_v3p0_v4p0__7_1_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_2_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_3: +define void @s_shuffle_v3p0_v4p0__u_3_3() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__0_3_3() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__1_3_3() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__1_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__2_3_3() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__2_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__3_3_3() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__4_3_3() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__5_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_4_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_3: +define void @s_shuffle_v3p0_v4p0__6_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -12278,10 +10854,8 @@ define void @s_shuffle_v3p0_v4p0__7_4_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART @@ -12289,7 +10863,7 @@ define void @s_shuffle_v3p0_v4p0__7_4_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -12298,10 +10872,8 @@ define void @s_shuffle_v3p0_v4p0__7_4_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: s_mov_b32 s12, s14 ; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART @@ -12309,135 +10881,129 @@ define void @s_shuffle_v3p0_v4p0__7_4_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_5_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_3: +define void @s_shuffle_v3p0_v4p0__7_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_6_3() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_3: +define void @s_shuffle_v3p0_v4p0__7_u_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_3: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_3: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -12448,8 +11014,6 @@ define void @s_shuffle_v3p0_v4p0__7_6_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s12, s6 ; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART @@ -12458,620 +11022,687 @@ define void @s_shuffle_v3p0_v4p0__7_6_3() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__u_4_4() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_4_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__0_4_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_4_4: +define void @s_shuffle_v3p0_v4p0__7_0_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_4_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_4_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__1_4_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_4_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_4_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_4_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__2_4_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_4_4: +define void @s_shuffle_v3p0_v4p0__7_1_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_4_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_4_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__3_4_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_4_4: +define void @s_shuffle_v3p0_v4p0__7_2_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_4_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_4_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__4_4_4() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_4_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__5_4_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_4_4: +define void @s_shuffle_v3p0_v4p0__7_4_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_4_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_4_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__6_4_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_4_4: +define void @s_shuffle_v3p0_v4p0__7_5_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_4_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_4_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_4_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_4: +define void @s_shuffle_v3p0_v4p0__7_6_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_u_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_4: +define void @s_shuffle_v3p0_v4p0__u_4_4() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__0_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_0_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_4: +define void @s_shuffle_v3p0_v4p0__1_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_1_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_4: +define void @s_shuffle_v3p0_v4p0__2_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_2_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_4: +define void @s_shuffle_v3p0_v4p0__3_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:19] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__4_4_4() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__5_4_4() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__5_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_3_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_4: +define void @s_shuffle_v3p0_v4p0__6_4_4() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__6_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_4_4() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_u_4() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_0_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -13082,12 +11713,14 @@ define void @s_shuffle_v3p0_v4p0__7_3_4() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -13098,12 +11731,14 @@ define void @s_shuffle_v3p0_v4p0__7_3_4() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -13114,124 +11749,196 @@ define void @s_shuffle_v3p0_v4p0__7_3_4() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s18 ; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_5_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_4: +define void @s_shuffle_v3p0_v4p0__7_1_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) ret void } -define void @s_shuffle_v3p0_v4p0__7_6_4() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_4: +define void @s_shuffle_v3p0_v4p0__7_2_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_4: +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_4: +; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_3_4() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_3_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_5_4() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_5_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_6_4() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_6_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> @@ -13386,15 +12093,15 @@ define void @s_shuffle_v3p0_v4p0__2_5_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -13404,15 +12111,15 @@ define void @s_shuffle_v3p0_v4p0__2_5_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -13422,15 +12129,16 @@ define void @s_shuffle_v3p0_v4p0__2_5_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s11, s3 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -13550,12 +12258,12 @@ define void @s_shuffle_v3p0_v4p0__6_5_5() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -13878,17 +12586,16 @@ define void @s_shuffle_v3p0_v4p0__7_3_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -14163,15 +12870,15 @@ define void @s_shuffle_v3p0_v4p0__2_6_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -14181,15 +12888,15 @@ define void @s_shuffle_v3p0_v4p0__2_6_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -14199,15 +12906,16 @@ define void @s_shuffle_v3p0_v4p0__2_6_6() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -14327,12 +13035,12 @@ define void @s_shuffle_v3p0_v4p0__6_6_6() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:13] ; GFX9-NEXT: ;;#ASMEND @@ -14574,15 +13282,15 @@ define void @s_shuffle_v3p0_v4p0__7_3_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -14592,15 +13300,15 @@ define void @s_shuffle_v3p0_v4p0__7_3_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -14610,15 +13318,16 @@ define void @s_shuffle_v3p0_v4p0__7_3_6() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -14671,88 +13380,56 @@ define void @s_shuffle_v3p0_v4p0__7_4_6() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__7_5_6() { -; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_5_6: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v3p0_v4p0__u_7_7() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_7_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_7_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_7_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_5_6() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_5_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__u_7_7() { +; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> @@ -14826,17 +13503,15 @@ define void @s_shuffle_v3p0_v4p0__1_7_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -14846,17 +13521,15 @@ define void @s_shuffle_v3p0_v4p0__1_7_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -14866,17 +13539,16 @@ define void @s_shuffle_v3p0_v4p0__1_7_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -14929,17 +13601,16 @@ define void @s_shuffle_v3p0_v4p0__2_7_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -14956,17 +13627,15 @@ define void @s_shuffle_v3p0_v4p0__3_7_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -14976,17 +13645,15 @@ define void @s_shuffle_v3p0_v4p0__3_7_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -14998,15 +13665,14 @@ define void @s_shuffle_v3p0_v4p0__3_7_7() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND @@ -15041,56 +13707,20 @@ define void @s_shuffle_v3p0_v4p0__4_7_7() { } define void @s_shuffle_v3p0_v4p0__5_7_7() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_7_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_7_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_7_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v4p0__5_7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> @@ -15099,56 +13729,18 @@ define void @s_shuffle_v3p0_v4p0__5_7_7() { } define void @s_shuffle_v3p0_v4p0__6_7_7() { -; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_7_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_7_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_7_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p0_v4p0__6_7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> @@ -15444,17 +14036,16 @@ define void @s_shuffle_v3p0_v4p0__7_3_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll index 1434189e6bda1..db1cf9faff8ec 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll @@ -776,15 +776,14 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll index 0c5fe591656bb..cbcb0b64e2ef8 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v3p3_v3p3__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v3p3_v3p3__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -237,48 +231,45 @@ define void @v_shuffle_v3p3_v3p3__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -292,46 +283,43 @@ define void @v_shuffle_v3p3_v3p3__5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -345,15 +333,14 @@ define void @v_shuffle_v3p3_v3p3__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -361,15 +348,14 @@ define void @v_shuffle_v3p3_v3p3__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -377,15 +363,14 @@ define void @v_shuffle_v3p3_v3p3__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -401,37 +386,35 @@ define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -445,36 +428,37 @@ define void @v_shuffle_v3p3_v3p3__5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v3p3_v3p3__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -533,50 +516,46 @@ define void @v_shuffle_v3p3_v3p3__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -591,14 +570,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -606,16 +585,15 @@ define void @v_shuffle_v3p3_v3p3__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -623,17 +601,15 @@ define void @v_shuffle_v3p3_v3p3__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -648,14 +624,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -663,15 +639,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -679,16 +654,15 @@ define void @v_shuffle_v3p3_v3p3__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -702,42 +676,40 @@ define void @v_shuffle_v3p3_v3p3__5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -751,42 +723,40 @@ define void @v_shuffle_v3p3_v3p3__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -800,39 +770,40 @@ define void @v_shuffle_v3p3_v3p3__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -858,26 +829,25 @@ define void @v_shuffle_v3p3_v3p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -948,29 +918,27 @@ define void @v_shuffle_v3p3_v3p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -983,42 +951,40 @@ define void @v_shuffle_v3p3_v3p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1043,26 +1009,25 @@ define void @v_shuffle_v3p3_v3p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1091,16 +1056,15 @@ define void @v_shuffle_v3p3_v3p3__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1108,17 +1072,15 @@ define void @v_shuffle_v3p3_v3p3__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1133,50 +1095,45 @@ define void @v_shuffle_v3p3_v3p3__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1190,49 +1147,42 @@ define void @v_shuffle_v3p3_v3p3__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1246,52 +1196,46 @@ define void @v_shuffle_v3p3_v3p3__5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1305,50 +1249,46 @@ define void @v_shuffle_v3p3_v3p3__5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1362,50 +1302,46 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1419,49 +1355,46 @@ define void @v_shuffle_v3p3_v3p3__5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1604,39 +1537,40 @@ define void @v_shuffle_v3p3_v3p3__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__2_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1747,15 +1681,14 @@ define void @v_shuffle_v3p3_v3p3__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1763,15 +1696,14 @@ define void @v_shuffle_v3p3_v3p3__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1779,16 +1711,14 @@ define void @v_shuffle_v3p3_v3p3__5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1802,15 +1732,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1818,15 +1747,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,16 +1762,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1857,15 +1783,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1873,16 +1798,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,17 +1814,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1915,15 +1837,14 @@ define void @v_shuffle_v3p3_v3p3__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1933,14 +1854,13 @@ define void @v_shuffle_v3p3_v3p3__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1950,14 +1870,13 @@ define void @v_shuffle_v3p3_v3p3__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1974,13 +1893,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1988,16 +1906,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2005,16 +1922,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2029,15 +1945,14 @@ define void @v_shuffle_v3p3_v3p3__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2045,15 +1960,15 @@ define void @v_shuffle_v3p3_v3p3__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2061,16 +1976,15 @@ define void @v_shuffle_v3p3_v3p3__5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2212,39 +2126,40 @@ define void @v_shuffle_v3p3_v3p3__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2353,16 +2268,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2370,15 +2284,14 @@ define void @v_shuffle_v3p3_v3p3__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2386,15 +2299,14 @@ define void @v_shuffle_v3p3_v3p3__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2408,46 +2320,43 @@ define void @v_shuffle_v3p3_v3p3__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2462,15 +2371,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,14 +2388,13 @@ define void @v_shuffle_v3p3_v3p3__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2497,14 +2404,13 @@ define void @v_shuffle_v3p3_v3p3__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2518,46 +2424,43 @@ define void @v_shuffle_v3p3_v3p3__5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2574,13 +2477,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2588,15 +2490,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2604,16 +2505,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2628,15 +2528,14 @@ define void @v_shuffle_v3p3_v3p3__5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2644,15 +2543,14 @@ define void @v_shuffle_v3p3_v3p3__5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2660,15 +2558,15 @@ define void @v_shuffle_v3p3_v3p3__5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2773,36 +2671,33 @@ define void @v_shuffle_v3p3_v3p3__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2839,29 +2734,27 @@ define void @v_shuffle_v3p3_v3p3__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2875,42 +2768,40 @@ define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2924,13 +2815,12 @@ define void @v_shuffle_v3p3_v3p3__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2970,14 +2860,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2985,16 +2875,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3002,16 +2891,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3026,14 +2914,14 @@ define void @v_shuffle_v3p3_v3p3__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3041,14 +2929,14 @@ define void @v_shuffle_v3p3_v3p3__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3056,15 +2944,15 @@ define void @v_shuffle_v3p3_v3p3__5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3079,14 +2967,14 @@ define void @v_shuffle_v3p3_v3p3__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3094,16 +2982,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3111,16 +2998,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3147,29 +3033,27 @@ define void @v_shuffle_v3p3_v3p3__5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3340,12 +3224,11 @@ define void @v_shuffle_v3p3_v3p3__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3355,13 +3238,13 @@ define void @v_shuffle_v3p3_v3p3__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3371,13 +3254,13 @@ define void @v_shuffle_v3p3_v3p3__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3480,39 +3363,40 @@ define void @v_shuffle_v3p3_v3p3__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3526,38 +3410,37 @@ define void @v_shuffle_v3p3_v3p3__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3572,15 +3455,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3588,16 +3470,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3605,16 +3486,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3629,15 +3509,14 @@ define void @v_shuffle_v3p3_v3p3__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3645,15 +3524,14 @@ define void @v_shuffle_v3p3_v3p3__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3661,16 +3539,15 @@ define void @v_shuffle_v3p3_v3p3__5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3685,15 +3562,14 @@ define void @v_shuffle_v3p3_v3p3__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3701,16 +3577,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3718,16 +3593,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3741,41 +3615,40 @@ define void @v_shuffle_v3p3_v3p3__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3945,15 +3818,15 @@ define void @v_shuffle_v3p3_v3p3__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,13 +3836,13 @@ define void @v_shuffle_v3p3_v3p3__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3979,13 +3852,13 @@ define void @v_shuffle_v3p3_v3p3__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4087,36 +3960,37 @@ define void @v_shuffle_v3p3_v3p3__5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4131,15 +4005,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4147,15 +4020,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4163,16 +4036,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4187,15 +4059,14 @@ define void @v_shuffle_v3p3_v3p3__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4203,15 +4074,14 @@ define void @v_shuffle_v3p3_v3p3__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4219,16 +4089,15 @@ define void @v_shuffle_v3p3_v3p3__5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4243,15 +4112,14 @@ define void @v_shuffle_v3p3_v3p3__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4259,15 +4127,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4275,15 +4143,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4297,42 +4165,40 @@ define void @v_shuffle_v3p3_v3p3__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4346,36 +4212,40 @@ define void @v_shuffle_v3p3_v3p3__5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll index c9f194d873e35..6127b40404f5e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v3p3_v4p3__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -141,12 +138,11 @@ define void @v_shuffle_v3p3_v4p3__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -236,36 +232,33 @@ define void @v_shuffle_v3p3_v4p3__6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__6_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__6_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -279,12 +272,11 @@ define void @v_shuffle_v3p3_v4p3__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -322,16 +314,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +368,14 @@ define void @v_shuffle_v3p3_v4p3__7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -431,15 +420,14 @@ define void @v_shuffle_v3p3_v4p3__7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -485,16 +473,14 @@ define void @v_shuffle_v3p3_v4p3__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -508,9 +494,8 @@ define void @v_shuffle_v3p3_v4p3__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -524,9 +509,9 @@ define void @v_shuffle_v3p3_v4p3__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -540,13 +525,12 @@ define void @v_shuffle_v3p3_v4p3__7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -586,12 +570,12 @@ define void @v_shuffle_v3p3_v4p3__7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -629,12 +613,12 @@ define void @v_shuffle_v3p3_v4p3__7_6_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -674,13 +658,12 @@ define void @v_shuffle_v3p3_v4p3__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -691,9 +674,8 @@ define void @v_shuffle_v3p3_v4p3__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -704,9 +686,8 @@ define void @v_shuffle_v3p3_v4p3__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -720,17 +701,15 @@ define void @v_shuffle_v3p3_v4p3__7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -740,14 +719,12 @@ define void @v_shuffle_v3p3_v4p3__7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -757,15 +734,12 @@ define void @v_shuffle_v3p3_v4p3__7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -780,16 +754,14 @@ define void @v_shuffle_v3p3_v4p3__7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -797,16 +769,15 @@ define void @v_shuffle_v3p3_v4p3__7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -814,17 +785,15 @@ define void @v_shuffle_v3p3_v4p3__7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -839,15 +808,14 @@ define void @v_shuffle_v3p3_v4p3__7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -855,15 +823,14 @@ define void @v_shuffle_v3p3_v4p3__7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -871,16 +838,15 @@ define void @v_shuffle_v3p3_v4p3__7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -895,15 +861,14 @@ define void @v_shuffle_v3p3_v4p3__7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -911,16 +876,15 @@ define void @v_shuffle_v3p3_v4p3__7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -928,16 +892,15 @@ define void @v_shuffle_v3p3_v4p3__7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -951,43 +914,39 @@ define void @v_shuffle_v3p3_v4p3__7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1001,42 +960,39 @@ define void @v_shuffle_v3p3_v4p3__7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1050,13 +1006,13 @@ define void @v_shuffle_v3p3_v4p3__7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1096,13 +1052,13 @@ define void @v_shuffle_v3p3_v4p3__7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1112,11 +1068,10 @@ define void @v_shuffle_v3p3_v4p3__7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1126,11 +1081,10 @@ define void @v_shuffle_v3p3_v4p3__7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1145,38 +1099,36 @@ define void @v_shuffle_v3p3_v4p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1247,29 +1199,27 @@ define void @v_shuffle_v3p3_v4p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1285,40 +1235,36 @@ define void @v_shuffle_v3p3_v4p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1331,43 +1277,40 @@ define void @v_shuffle_v3p3_v4p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__3_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1381,38 +1324,36 @@ define void @v_shuffle_v3p3_v4p3__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__4_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__4_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1442,16 +1383,15 @@ define void @v_shuffle_v3p3_v4p3__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1459,17 +1399,15 @@ define void @v_shuffle_v3p3_v4p3__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1486,14 +1424,12 @@ define void @v_shuffle_v3p3_v4p3__6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1503,14 +1439,12 @@ define void @v_shuffle_v3p3_v4p3__6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1520,15 +1454,13 @@ define void @v_shuffle_v3p3_v4p3__6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1543,16 +1475,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1560,16 +1490,15 @@ define void @v_shuffle_v3p3_v4p3__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1577,17 +1506,15 @@ define void @v_shuffle_v3p3_v4p3__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1601,16 +1528,14 @@ define void @v_shuffle_v3p3_v4p3__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1618,15 +1543,14 @@ define void @v_shuffle_v3p3_v4p3__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1634,16 +1558,15 @@ define void @v_shuffle_v3p3_v4p3__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1657,17 +1580,15 @@ define void @v_shuffle_v3p3_v4p3__7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1675,16 +1596,15 @@ define void @v_shuffle_v3p3_v4p3__7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1692,17 +1612,15 @@ define void @v_shuffle_v3p3_v4p3__7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1716,50 +1634,48 @@ define void @v_shuffle_v3p3_v4p3__7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v10, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v9 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v10, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1773,17 +1689,15 @@ define void @v_shuffle_v3p3_v4p3__7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1829,17 +1743,15 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1847,16 +1759,15 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1864,16 +1775,16 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1887,16 +1798,15 @@ define void @v_shuffle_v3p3_v4p3__7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1943,16 +1853,15 @@ define void @v_shuffle_v3p3_v4p3__7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v9, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1960,16 +1869,15 @@ define void @v_shuffle_v3p3_v4p3__7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1977,17 +1885,16 @@ define void @v_shuffle_v3p3_v4p3__7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2133,10 +2040,10 @@ define void @v_shuffle_v3p3_v4p3__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2146,10 +2053,10 @@ define void @v_shuffle_v3p3_v4p3__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2159,10 +2066,10 @@ define void @v_shuffle_v3p3_v4p3__2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2175,13 +2082,13 @@ define void @v_shuffle_v3p3_v4p3__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2319,15 +2226,14 @@ define void @v_shuffle_v3p3_v4p3__6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2335,15 +2241,15 @@ define void @v_shuffle_v3p3_v4p3__6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2351,16 +2257,15 @@ define void @v_shuffle_v3p3_v4p3__6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2375,15 +2280,14 @@ define void @v_shuffle_v3p3_v4p3__7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2430,16 +2334,14 @@ define void @v_shuffle_v3p3_v4p3__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2486,17 +2388,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2546,16 +2445,14 @@ define void @v_shuffle_v3p3_v4p3__7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2604,15 +2501,14 @@ define void @v_shuffle_v3p3_v4p3__7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2659,16 +2555,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2717,15 +2611,14 @@ define void @v_shuffle_v3p3_v4p3__7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2773,15 +2666,14 @@ define void @v_shuffle_v3p3_v4p3__7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2961,10 +2853,10 @@ define void @v_shuffle_v3p3_v4p3__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2974,10 +2866,10 @@ define void @v_shuffle_v3p3_v4p3__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2987,10 +2879,10 @@ define void @v_shuffle_v3p3_v4p3__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3003,13 +2895,13 @@ define void @v_shuffle_v3p3_v4p3__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3146,15 +3038,14 @@ define void @v_shuffle_v3p3_v4p3__6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3162,15 +3053,14 @@ define void @v_shuffle_v3p3_v4p3__6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3178,15 +3068,15 @@ define void @v_shuffle_v3p3_v4p3__6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3201,15 +3091,14 @@ define void @v_shuffle_v3p3_v4p3__7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3255,15 +3144,14 @@ define void @v_shuffle_v3p3_v4p3__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3309,16 +3197,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3366,15 +3252,14 @@ define void @v_shuffle_v3p3_v4p3__7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3420,15 +3305,14 @@ define void @v_shuffle_v3p3_v4p3__7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3475,15 +3359,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3531,14 +3414,13 @@ define void @v_shuffle_v3p3_v4p3__7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v1 ; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3586,15 +3468,14 @@ define void @v_shuffle_v3p3_v4p3__7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3641,38 +3522,37 @@ define void @v_shuffle_v3p3_v4p3__u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3745,11 +3625,10 @@ define void @v_shuffle_v3p3_v4p3__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3759,11 +3638,10 @@ define void @v_shuffle_v3p3_v4p3__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3776,41 +3654,37 @@ define void @v_shuffle_v3p3_v4p3__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3823,13 +3697,13 @@ define void @v_shuffle_v3p3_v4p3__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3839,11 +3713,10 @@ define void @v_shuffle_v3p3_v4p3__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3853,11 +3726,10 @@ define void @v_shuffle_v3p3_v4p3__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3870,38 +3742,37 @@ define void @v_shuffle_v3p3_v4p3__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__4_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3933,14 +3804,13 @@ define void @v_shuffle_v3p3_v4p3__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3950,14 +3820,13 @@ define void @v_shuffle_v3p3_v4p3__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3972,15 +3841,14 @@ define void @v_shuffle_v3p3_v4p3__6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3988,16 +3856,15 @@ define void @v_shuffle_v3p3_v4p3__6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4005,16 +3872,15 @@ define void @v_shuffle_v3p3_v4p3__6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4028,16 +3894,15 @@ define void @v_shuffle_v3p3_v4p3__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4047,14 +3912,13 @@ define void @v_shuffle_v3p3_v4p3__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4064,14 +3928,13 @@ define void @v_shuffle_v3p3_v4p3__7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4085,15 +3948,14 @@ define void @v_shuffle_v3p3_v4p3__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4140,15 +4002,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4197,15 +4058,14 @@ define void @v_shuffle_v3p3_v4p3__7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4251,15 +4111,14 @@ define void @v_shuffle_v3p3_v4p3__7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4308,15 +4167,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4365,15 +4223,14 @@ define void @v_shuffle_v3p3_v4p3__7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4420,15 +4277,14 @@ define void @v_shuffle_v3p3_v4p3__7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4567,36 +4423,33 @@ define void @v_shuffle_v3p3_v4p3__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__2_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4609,12 +4462,11 @@ define void @v_shuffle_v3p3_v4p3__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4675,29 +4527,27 @@ define void @v_shuffle_v3p3_v4p3__5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4714,40 +4564,36 @@ define void @v_shuffle_v3p3_v4p3__6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__6_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4761,43 +4607,40 @@ define void @v_shuffle_v3p3_v4p3__7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4811,40 +4654,37 @@ define void @v_shuffle_v3p3_v4p3__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4859,50 +4699,47 @@ define void @v_shuffle_v3p3_v4p3__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_0_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx3 v10, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_0_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: global_store_dwordx3 v10, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4917,14 +4754,14 @@ define void @v_shuffle_v3p3_v4p3__7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4970,14 +4807,14 @@ define void @v_shuffle_v3p3_v4p3__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4985,15 +4822,14 @@ define void @v_shuffle_v3p3_v4p3__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5002,15 +4838,14 @@ define void @v_shuffle_v3p3_v4p3__7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -5026,16 +4861,14 @@ define void @v_shuffle_v3p3_v4p3__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5049,10 +4882,8 @@ define void @v_shuffle_v3p3_v4p3__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5066,10 +4897,9 @@ define void @v_shuffle_v3p3_v4p3__7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5083,43 +4913,40 @@ define void @v_shuffle_v3p3_v4p3__7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5133,42 +4960,40 @@ define void @v_shuffle_v3p3_v4p3__7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5342,9 +5167,8 @@ define void @v_shuffle_v3p3_v4p3__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5352,15 +5176,15 @@ define void @v_shuffle_v3p3_v4p3__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5368,15 +5192,15 @@ define void @v_shuffle_v3p3_v4p3__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5394,12 +5218,11 @@ define void @v_shuffle_v3p3_v4p3__3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5537,10 +5360,10 @@ define void @v_shuffle_v3p3_v4p3__6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5550,10 +5373,10 @@ define void @v_shuffle_v3p3_v4p3__6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5563,10 +5386,10 @@ define void @v_shuffle_v3p3_v4p3__6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5580,13 +5403,13 @@ define void @v_shuffle_v3p3_v4p3__7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5626,13 +5449,12 @@ define void @v_shuffle_v3p3_v4p3__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5673,16 +5495,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5731,15 +5551,14 @@ define void @v_shuffle_v3p3_v4p3__7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5787,15 +5606,14 @@ define void @v_shuffle_v3p3_v4p3__7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5844,16 +5662,14 @@ define void @v_shuffle_v3p3_v4p3__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5863,14 +5679,13 @@ define void @v_shuffle_v3p3_v4p3__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5880,14 +5695,13 @@ define void @v_shuffle_v3p3_v4p3__7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5901,14 +5715,13 @@ define void @v_shuffle_v3p3_v4p3__7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6162,9 +5975,9 @@ define void @v_shuffle_v3p3_v4p3__2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6172,15 +5985,15 @@ define void @v_shuffle_v3p3_v4p3__2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6188,15 +6001,15 @@ define void @v_shuffle_v3p3_v4p3__2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6214,12 +6027,12 @@ define void @v_shuffle_v3p3_v4p3__3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v6 -; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6356,10 +6169,10 @@ define void @v_shuffle_v3p3_v4p3__6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6369,10 +6182,10 @@ define void @v_shuffle_v3p3_v4p3__6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6382,10 +6195,10 @@ define void @v_shuffle_v3p3_v4p3__6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6399,13 +6212,13 @@ define void @v_shuffle_v3p3_v4p3__7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6445,12 +6258,12 @@ define void @v_shuffle_v3p3_v4p3__7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6489,15 +6302,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6545,15 +6357,14 @@ define void @v_shuffle_v3p3_v4p3__7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6601,15 +6412,14 @@ define void @v_shuffle_v3p3_v4p3__7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6656,15 +6466,14 @@ define void @v_shuffle_v3p3_v4p3__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6672,15 +6481,15 @@ define void @v_shuffle_v3p3_v4p3__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6688,15 +6497,15 @@ define void @v_shuffle_v3p3_v4p3__7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6759,12 +6568,13 @@ define void @v_shuffle_v3p3_v4p3__7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6802,38 +6612,37 @@ define void @v_shuffle_v3p3_v4p3__u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__u_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__u_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__u_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6920,16 +6729,15 @@ define void @v_shuffle_v3p3_v4p3__1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6937,16 +6745,15 @@ define void @v_shuffle_v3p3_v4p3__1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6967,9 +6774,9 @@ define void @v_shuffle_v3p3_v4p3__2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6979,14 +6786,13 @@ define void @v_shuffle_v3p3_v4p3__2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6996,14 +6802,13 @@ define void @v_shuffle_v3p3_v4p3__2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7021,12 +6826,12 @@ define void @v_shuffle_v3p3_v4p3__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7034,16 +6839,15 @@ define void @v_shuffle_v3p3_v4p3__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7051,16 +6855,16 @@ define void @v_shuffle_v3p3_v4p3__3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7135,11 +6939,10 @@ define void @v_shuffle_v3p3_v4p3__5_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7149,11 +6952,10 @@ define void @v_shuffle_v3p3_v4p3__5_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7167,41 +6969,37 @@ define void @v_shuffle_v3p3_v4p3__6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__6_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__6_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7215,12 +7013,12 @@ define void @v_shuffle_v3p3_v4p3__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7261,15 +7059,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7319,15 +7116,14 @@ define void @v_shuffle_v3p3_v4p3__7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7375,15 +7171,14 @@ define void @v_shuffle_v3p3_v4p3__7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7432,15 +7227,14 @@ define void @v_shuffle_v3p3_v4p3__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7450,14 +7244,13 @@ define void @v_shuffle_v3p3_v4p3__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7467,14 +7260,13 @@ define void @v_shuffle_v3p3_v4p3__7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7488,13 +7280,13 @@ define void @v_shuffle_v3p3_v4p3__7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7537,13 +7329,13 @@ define void @v_shuffle_v3p3_v4p3__7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7583,12 +7375,13 @@ define void @v_shuffle_v3p3_v4p3__7_6_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll index c7092f04a23ed..2a0344fce9f44 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v4f32_v2f32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v4f32_v2f32__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -207,15 +204,14 @@ define void @v_shuffle_v4f32_v2f32__3_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -262,10 +258,10 @@ define void @v_shuffle_v4f32_v2f32__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -306,12 +302,12 @@ define void @v_shuffle_v4f32_v2f32__3_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -349,15 +345,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -402,14 +398,14 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -457,11 +453,11 @@ define void @v_shuffle_v4f32_v2f32__3_3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -502,13 +498,13 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -548,16 +544,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -604,15 +600,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -660,12 +656,12 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -705,14 +701,14 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -847,14 +843,14 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__1_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -939,16 +935,15 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -995,15 +990,14 @@ define void @v_shuffle_v4f32_v2f32__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1050,16 +1044,15 @@ define void @v_shuffle_v4f32_v2f32__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1108,15 +1101,15 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1163,15 +1156,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1217,16 +1210,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1272,16 +1264,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1327,17 +1319,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1477,14 +1468,14 @@ define void @v_shuffle_v4f32_v2f32__1_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__1_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1571,17 +1562,16 @@ define void @v_shuffle_v4f32_v2f32__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1629,16 +1619,15 @@ define void @v_shuffle_v4f32_v2f32__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_u_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1684,16 +1673,15 @@ define void @v_shuffle_v4f32_v2f32__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_0_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1742,13 +1730,13 @@ define void @v_shuffle_v4f32_v2f32__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1795,15 +1783,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1849,15 +1837,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1901,15 +1889,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1956,14 +1944,13 @@ define void @v_shuffle_v4f32_v2f32__3_3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2061,12 +2048,11 @@ define void @v_shuffle_v4f32_v2f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__1_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2114,14 +2100,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2161,13 +2147,13 @@ define void @v_shuffle_v4f32_v2f32__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2208,15 +2194,15 @@ define void @v_shuffle_v4f32_v2f32__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2263,15 +2249,15 @@ define void @v_shuffle_v4f32_v2f32__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2317,14 +2303,14 @@ define void @v_shuffle_v4f32_v2f32__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2366,14 +2352,13 @@ define void @v_shuffle_v4f32_v2f32__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2417,15 +2402,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2475,15 +2460,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2632,16 +2617,15 @@ define void @v_shuffle_v4f32_v2f32__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2736,13 +2720,13 @@ define void @v_shuffle_v4f32_v2f32__3_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2783,15 +2767,15 @@ define void @v_shuffle_v4f32_v2f32__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2838,15 +2822,15 @@ define void @v_shuffle_v4f32_v2f32__3_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2892,13 +2876,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2938,13 +2923,13 @@ define void @v_shuffle_v4f32_v2f32__3_3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2985,15 +2970,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3040,15 +3025,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3096,13 +3081,14 @@ define void @v_shuffle_v4f32_v2f32__3_3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll index 86211d4e3c3d8..504f4aad7e682 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v4f32_v3f32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -100,36 +99,33 @@ define void @v_shuffle_v4f32_v3f32__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__2_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -153,12 +149,11 @@ define void @v_shuffle_v4f32_v3f32__4_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__4_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -196,36 +191,33 @@ define void @v_shuffle_v4f32_v3f32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -239,48 +231,45 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_0_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_0_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -294,46 +283,43 @@ define void @v_shuffle_v4f32_v3f32__5_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_1_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_1_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -347,16 +333,14 @@ define void @v_shuffle_v4f32_v3f32__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -364,15 +348,14 @@ define void @v_shuffle_v4f32_v3f32__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -380,15 +363,14 @@ define void @v_shuffle_v4f32_v3f32__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -404,37 +386,35 @@ define void @v_shuffle_v4f32_v3f32__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -448,36 +428,37 @@ define void @v_shuffle_v4f32_v3f32__5_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_4_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -491,39 +472,37 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -537,51 +516,46 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -596,15 +570,14 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -612,16 +585,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -629,17 +601,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -654,15 +624,14 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -670,15 +639,14 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -686,16 +654,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -711,40 +678,38 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -758,42 +723,40 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -807,39 +770,40 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -853,50 +817,51 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v9, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v9, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -911,15 +876,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -927,15 +892,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -943,16 +908,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -967,15 +932,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -983,16 +948,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1000,16 +965,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1025,43 +990,41 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1075,45 +1038,43 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1127,42 +1088,43 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1189,29 +1151,26 @@ define void @v_shuffle_v4f32_v3f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1273,42 +1232,39 @@ define void @v_shuffle_v4f32_v3f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__1_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__1_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1321,45 +1277,43 @@ define void @v_shuffle_v4f32_v3f32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1385,29 +1339,26 @@ define void @v_shuffle_v4f32_v3f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1421,16 +1372,15 @@ define void @v_shuffle_v4f32_v3f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1438,15 +1388,14 @@ define void @v_shuffle_v4f32_v3f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1455,16 +1404,15 @@ define void @v_shuffle_v4f32_v3f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1480,16 +1428,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1497,17 +1444,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1515,17 +1460,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1540,49 +1483,44 @@ define void @v_shuffle_v4f32_v3f32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1597,16 +1535,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1614,17 +1551,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1632,17 +1567,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1657,16 +1590,15 @@ define void @v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1676,15 +1608,13 @@ define void @v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1694,15 +1624,13 @@ define void @v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1717,16 +1645,15 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1734,17 +1661,15 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1752,17 +1677,15 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1777,15 +1700,15 @@ define void @v_shuffle_v4f32_v3f32__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1793,16 +1716,15 @@ define void @v_shuffle_v4f32_v3f32__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1810,16 +1732,15 @@ define void @v_shuffle_v4f32_v3f32__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1834,16 +1755,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1851,17 +1771,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1869,17 +1787,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1893,17 +1809,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1911,16 +1825,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1928,16 +1841,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1951,17 +1863,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1969,16 +1880,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1986,17 +1896,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2010,53 +1918,51 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 ; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v9, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v8 ; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v9, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2070,53 +1976,51 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v9, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v9, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2130,17 +2034,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2148,16 +2051,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2165,17 +2067,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2282,11 +2183,11 @@ define void @v_shuffle_v4f32_v3f32__1_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2327,42 +2228,43 @@ define void @v_shuffle_v4f32_v3f32__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__2_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__2_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2421,16 +2323,15 @@ define void @v_shuffle_v4f32_v3f32__4_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2381,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2497,16 +2397,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2514,17 +2413,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2539,15 +2436,14 @@ define void @v_shuffle_v4f32_v3f32__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2555,16 +2451,14 @@ define void @v_shuffle_v4f32_v3f32__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2572,17 +2466,14 @@ define void @v_shuffle_v4f32_v3f32__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2597,15 +2488,14 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2613,17 +2503,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2631,17 +2519,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2656,16 +2542,15 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2675,15 +2560,13 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2693,15 +2576,13 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2718,14 +2599,13 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2733,17 +2613,15 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,17 +2629,15 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2776,15 +2652,15 @@ define void @v_shuffle_v4f32_v3f32__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2792,15 +2668,15 @@ define void @v_shuffle_v4f32_v3f32__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2808,16 +2684,15 @@ define void @v_shuffle_v4f32_v3f32__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2832,16 +2707,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2849,17 +2723,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2867,17 +2739,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2891,51 +2761,46 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2949,52 +2814,46 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3009,16 +2868,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3028,15 +2886,13 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3046,15 +2902,13 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3069,16 +2923,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3086,17 +2939,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3104,17 +2955,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3129,16 +2979,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3146,17 +2995,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3164,17 +3011,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3188,13 +3034,13 @@ define void @v_shuffle_v4f32_v3f32__u_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__u_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3278,14 +3124,13 @@ define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__1_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3324,42 +3169,43 @@ define void @v_shuffle_v4f32_v3f32__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__2_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__2_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3372,13 +3218,13 @@ define void @v_shuffle_v4f32_v3f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__3_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3418,16 +3264,15 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3474,17 +3319,16 @@ define void @v_shuffle_v4f32_v3f32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3492,16 +3336,15 @@ define void @v_shuffle_v4f32_v3f32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3509,16 +3352,15 @@ define void @v_shuffle_v4f32_v3f32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3532,48 +3374,46 @@ define void @v_shuffle_v4f32_v3f32__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3588,15 +3428,14 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3606,15 +3445,14 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3624,15 +3462,14 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3646,48 +3483,46 @@ define void @v_shuffle_v4f32_v3f32__5_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_1_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_1_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3704,14 +3539,13 @@ define void @v_shuffle_v4f32_v3f32__5_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3719,16 +3553,15 @@ define void @v_shuffle_v4f32_v3f32__5_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3736,16 +3569,15 @@ define void @v_shuffle_v4f32_v3f32__5_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3760,15 +3592,15 @@ define void @v_shuffle_v4f32_v3f32__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3778,14 +3610,13 @@ define void @v_shuffle_v4f32_v3f32__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3795,14 +3626,13 @@ define void @v_shuffle_v4f32_v3f32__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3817,16 +3647,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3834,16 +3663,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3851,16 +3679,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3875,15 +3702,14 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3891,16 +3717,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3908,16 +3733,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3932,16 +3756,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[5:7] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3951,15 +3774,13 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3969,15 +3790,13 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3992,15 +3811,14 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4008,16 +3826,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4025,16 +3842,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4049,53 +3866,49 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v6 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4110,16 +3923,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4127,16 +3939,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4144,17 +3955,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4218,12 +4028,11 @@ define void @v_shuffle_v4f32_v3f32__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__1_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4260,36 +4069,33 @@ define void @v_shuffle_v4f32_v3f32__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4314,42 +4120,39 @@ define void @v_shuffle_v4f32_v3f32__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__4_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__4_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4363,45 +4166,43 @@ define void @v_shuffle_v4f32_v3f32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4415,14 +4216,13 @@ define void @v_shuffle_v4f32_v3f32__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4464,15 +4264,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4480,17 +4280,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4498,17 +4297,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4523,15 +4321,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4539,16 +4337,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4556,17 +4353,16 @@ define void @v_shuffle_v4f32_v3f32__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4581,51 +4377,49 @@ define void @v_shuffle_v4f32_v3f32__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4641,43 +4435,41 @@ define void @v_shuffle_v4f32_v3f32__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4693,43 +4485,41 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4743,43 +4533,40 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4794,15 +4581,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4810,17 +4597,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4828,17 +4613,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4853,15 +4637,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4869,16 +4653,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4886,17 +4669,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4911,15 +4693,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4927,16 +4709,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4944,17 +4725,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4970,40 +4750,39 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5120,16 +4899,15 @@ define void @v_shuffle_v4f32_v3f32__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5179,16 +4957,15 @@ define void @v_shuffle_v4f32_v3f32__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5196,16 +4973,15 @@ define void @v_shuffle_v4f32_v3f32__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5213,17 +4989,15 @@ define void @v_shuffle_v4f32_v3f32__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5286,11 +5060,11 @@ define void @v_shuffle_v4f32_v3f32__4_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5332,42 +5106,43 @@ define void @v_shuffle_v4f32_v3f32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5381,13 +5156,13 @@ define void @v_shuffle_v4f32_v3f32__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5429,15 +5204,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5445,17 +5220,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5463,17 +5237,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5488,15 +5261,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5504,15 +5277,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5520,16 +5293,16 @@ define void @v_shuffle_v4f32_v3f32__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5544,51 +5317,49 @@ define void @v_shuffle_v4f32_v3f32__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5602,44 +5373,43 @@ define void @v_shuffle_v4f32_v3f32__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5653,45 +5423,43 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5705,43 +5473,40 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5756,16 +5521,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5773,17 +5537,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5791,17 +5553,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5816,16 +5577,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5833,17 +5593,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5851,17 +5610,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5876,16 +5634,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5893,16 +5650,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5910,17 +5666,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5934,44 +5689,43 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5985,13 +5739,13 @@ define void @v_shuffle_v4f32_v3f32__u_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__u_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6090,16 +5844,15 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6147,16 +5900,15 @@ define void @v_shuffle_v4f32_v3f32__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6164,16 +5916,16 @@ define void @v_shuffle_v4f32_v3f32__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6181,17 +5933,16 @@ define void @v_shuffle_v4f32_v3f32__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6251,14 +6002,13 @@ define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__4_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6298,39 +6048,40 @@ define void @v_shuffle_v4f32_v3f32__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6345,16 +6096,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6362,16 +6112,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6379,16 +6129,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6403,16 +6153,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6420,16 +6169,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6437,17 +6185,16 @@ define void @v_shuffle_v4f32_v3f32__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6464,14 +6211,13 @@ define void @v_shuffle_v4f32_v3f32__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6481,14 +6227,14 @@ define void @v_shuffle_v4f32_v3f32__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6498,15 +6244,14 @@ define void @v_shuffle_v4f32_v3f32__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6520,44 +6265,43 @@ define void @v_shuffle_v4f32_v3f32__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6571,39 +6315,43 @@ define void @v_shuffle_v4f32_v3f32__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_4_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6617,42 +6365,40 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6667,16 +6413,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6684,17 +6429,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6702,17 +6445,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6727,16 +6469,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6746,14 +6487,13 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6763,15 +6503,14 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6786,16 +6525,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6803,16 +6541,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6820,17 +6557,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6844,45 +6580,43 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6896,42 +6630,41 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll index d5bd41397c4f0..5fdd57da3dab7 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v4f32_v4f32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -100,36 +99,33 @@ define void @v_shuffle_v4f32_v4f32__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__2_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -142,12 +138,11 @@ define void @v_shuffle_v4f32_v4f32__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -195,12 +190,11 @@ define void @v_shuffle_v4f32_v4f32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,36 +232,33 @@ define void @v_shuffle_v4f32_v4f32__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__6_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -281,12 +272,11 @@ define void @v_shuffle_v4f32_v4f32__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -324,16 +314,14 @@ define void @v_shuffle_v4f32_v4f32__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +366,14 @@ define void @v_shuffle_v4f32_v4f32__7_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -431,16 +418,14 @@ define void @v_shuffle_v4f32_v4f32__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -485,16 +470,14 @@ define void @v_shuffle_v4f32_v4f32__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -508,9 +491,8 @@ define void @v_shuffle_v4f32_v4f32__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -524,9 +506,9 @@ define void @v_shuffle_v4f32_v4f32__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -540,13 +522,12 @@ define void @v_shuffle_v4f32_v4f32__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -584,12 +565,12 @@ define void @v_shuffle_v4f32_v4f32__7_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -627,13 +608,12 @@ define void @v_shuffle_v4f32_v4f32__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -671,13 +651,12 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -688,9 +667,8 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -701,9 +679,8 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -717,17 +694,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -737,14 +712,12 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -754,15 +727,12 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -777,16 +747,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -794,16 +762,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -811,17 +778,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -836,15 +801,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -852,15 +816,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -868,16 +831,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -892,16 +854,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -909,16 +869,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -926,16 +885,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -949,43 +907,39 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -999,42 +953,39 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1048,13 +999,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1094,14 +1045,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1111,11 +1061,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1125,11 +1074,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1143,18 +1091,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1162,16 +1108,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1179,17 +1124,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1204,17 +1147,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,17 +1163,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1240,18 +1179,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1266,17 +1202,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1284,16 +1218,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1301,17 +1234,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1326,16 +1258,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1343,16 +1274,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1360,17 +1290,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1384,14 +1313,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1431,46 +1360,42 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1484,15 +1409,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1532,14 +1456,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1549,11 +1473,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1563,11 +1487,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1582,13 +1506,12 @@ define void @v_shuffle_v4f32_v4f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1596,13 +1519,12 @@ define void @v_shuffle_v4f32_v4f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1610,13 +1532,12 @@ define void @v_shuffle_v4f32_v4f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1692,13 +1613,12 @@ define void @v_shuffle_v4f32_v4f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1706,13 +1626,12 @@ define void @v_shuffle_v4f32_v4f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1728,43 +1647,39 @@ define void @v_shuffle_v4f32_v4f32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1777,15 +1692,14 @@ define void @v_shuffle_v4f32_v4f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1793,13 +1707,12 @@ define void @v_shuffle_v4f32_v4f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1807,13 +1720,12 @@ define void @v_shuffle_v4f32_v4f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1827,13 +1739,12 @@ define void @v_shuffle_v4f32_v4f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1841,13 +1752,12 @@ define void @v_shuffle_v4f32_v4f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1855,13 +1765,12 @@ define void @v_shuffle_v4f32_v4f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1877,15 +1786,13 @@ define void @v_shuffle_v4f32_v4f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1893,16 +1800,15 @@ define void @v_shuffle_v4f32_v4f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1910,17 +1816,16 @@ define void @v_shuffle_v4f32_v4f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1935,17 +1840,15 @@ define void @v_shuffle_v4f32_v4f32__6_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1953,17 +1856,15 @@ define void @v_shuffle_v4f32_v4f32__6_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1971,18 +1872,15 @@ define void @v_shuffle_v4f32_v4f32__6_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1997,17 +1895,15 @@ define void @v_shuffle_v4f32_v4f32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2015,16 +1911,15 @@ define void @v_shuffle_v4f32_v4f32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2032,17 +1927,16 @@ define void @v_shuffle_v4f32_v4f32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2057,16 +1951,14 @@ define void @v_shuffle_v4f32_v4f32__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2074,16 +1966,15 @@ define void @v_shuffle_v4f32_v4f32__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2091,17 +1982,15 @@ define void @v_shuffle_v4f32_v4f32__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2116,17 +2005,15 @@ define void @v_shuffle_v4f32_v4f32__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2134,17 +2021,16 @@ define void @v_shuffle_v4f32_v4f32__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2152,18 +2038,16 @@ define void @v_shuffle_v4f32_v4f32__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2178,16 +2062,15 @@ define void @v_shuffle_v4f32_v4f32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v7, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2195,16 +2078,15 @@ define void @v_shuffle_v4f32_v4f32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2212,16 +2094,15 @@ define void @v_shuffle_v4f32_v4f32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2236,17 +2117,15 @@ define void @v_shuffle_v4f32_v4f32__7_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2295,17 +2174,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2313,16 +2190,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2330,17 +2206,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2355,16 +2229,15 @@ define void @v_shuffle_v4f32_v4f32__7_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2414,17 +2287,15 @@ define void @v_shuffle_v4f32_v4f32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2432,16 +2303,15 @@ define void @v_shuffle_v4f32_v4f32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2449,17 +2319,15 @@ define void @v_shuffle_v4f32_v4f32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2474,17 +2342,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2492,17 +2358,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2510,18 +2374,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2535,17 +2396,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2553,16 +2412,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2570,17 +2428,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2594,18 +2450,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2613,16 +2467,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2630,17 +2483,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2654,18 +2505,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v8 +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2673,17 +2522,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2691,17 +2539,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2715,17 +2562,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2733,16 +2579,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2750,16 +2595,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2773,18 +2617,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2794,15 +2636,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2812,15 +2653,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2834,17 +2674,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2852,16 +2691,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2869,17 +2707,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2893,17 +2729,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3045,11 +2880,11 @@ define void @v_shuffle_v4f32_v4f32__1_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3093,11 +2928,11 @@ define void @v_shuffle_v4f32_v4f32__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3107,11 +2942,11 @@ define void @v_shuffle_v4f32_v4f32__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3121,11 +2956,11 @@ define void @v_shuffle_v4f32_v4f32__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3138,14 +2973,14 @@ define void @v_shuffle_v4f32_v4f32__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3232,16 +3067,15 @@ define void @v_shuffle_v4f32_v4f32__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3291,16 +3125,15 @@ define void @v_shuffle_v4f32_v4f32__6_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3308,16 +3141,15 @@ define void @v_shuffle_v4f32_v4f32__6_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3325,17 +3157,15 @@ define void @v_shuffle_v4f32_v4f32__6_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3350,16 +3180,15 @@ define void @v_shuffle_v4f32_v4f32__7_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3409,16 +3238,14 @@ define void @v_shuffle_v4f32_v4f32__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3426,16 +3253,15 @@ define void @v_shuffle_v4f32_v4f32__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3443,17 +3269,15 @@ define void @v_shuffle_v4f32_v4f32__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3468,17 +3292,14 @@ define void @v_shuffle_v4f32_v4f32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3486,16 +3307,15 @@ define void @v_shuffle_v4f32_v4f32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3503,17 +3323,16 @@ define void @v_shuffle_v4f32_v4f32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3528,17 +3347,15 @@ define void @v_shuffle_v4f32_v4f32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3546,16 +3363,15 @@ define void @v_shuffle_v4f32_v4f32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3563,16 +3379,15 @@ define void @v_shuffle_v4f32_v4f32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3587,16 +3402,15 @@ define void @v_shuffle_v4f32_v4f32__7_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3645,17 +3459,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3663,16 +3475,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3680,17 +3491,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3705,16 +3514,15 @@ define void @v_shuffle_v4f32_v4f32__7_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3764,17 +3572,15 @@ define void @v_shuffle_v4f32_v4f32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3782,16 +3588,15 @@ define void @v_shuffle_v4f32_v4f32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3799,17 +3604,15 @@ define void @v_shuffle_v4f32_v4f32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3824,17 +3627,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3842,17 +3643,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3860,18 +3659,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3885,17 +3681,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3905,14 +3699,12 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3922,15 +3714,12 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3944,18 +3733,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3965,15 +3751,12 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3983,16 +3766,12 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4007,17 +3786,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4027,15 +3804,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4045,15 +3820,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4068,17 +3841,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v8 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4088,15 +3859,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4106,15 +3875,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4129,17 +3896,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4147,17 +3912,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4165,17 +3928,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4190,16 +3951,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4207,17 +3967,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v3 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4225,17 +3983,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4250,16 +4006,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4311,10 +4066,10 @@ define void @v_shuffle_v4f32_v4f32__u_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4401,11 +4156,10 @@ define void @v_shuffle_v4f32_v4f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4447,11 +4201,11 @@ define void @v_shuffle_v4f32_v4f32__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4461,11 +4215,11 @@ define void @v_shuffle_v4f32_v4f32__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4475,11 +4229,11 @@ define void @v_shuffle_v4f32_v4f32__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4492,14 +4246,14 @@ define void @v_shuffle_v4f32_v4f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4541,10 +4295,10 @@ define void @v_shuffle_v4f32_v4f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4584,16 +4338,15 @@ define void @v_shuffle_v4f32_v4f32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4641,16 +4394,15 @@ define void @v_shuffle_v4f32_v4f32__6_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4658,16 +4410,15 @@ define void @v_shuffle_v4f32_v4f32__6_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4675,16 +4426,15 @@ define void @v_shuffle_v4f32_v4f32__6_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4699,16 +4449,15 @@ define void @v_shuffle_v4f32_v4f32__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4756,19 +4505,18 @@ define void @v_shuffle_v4f32_v4f32__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_2_2: +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -4811,17 +4559,14 @@ define void @v_shuffle_v4f32_v4f32__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4868,15 +4613,14 @@ define void @v_shuffle_v4f32_v4f32__7_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4923,16 +4667,15 @@ define void @v_shuffle_v4f32_v4f32__7_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4981,16 +4724,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5037,15 +4779,14 @@ define void @v_shuffle_v4f32_v4f32__7_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -5095,16 +4836,15 @@ define void @v_shuffle_v4f32_v4f32__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5151,16 +4891,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5168,16 +4907,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5185,16 +4923,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5209,16 +4946,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5226,16 +4961,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5243,16 +4977,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5267,17 +5000,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5287,15 +5018,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5305,15 +5034,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5328,17 +5055,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5346,16 +5070,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5363,16 +5086,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[6:7] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5393,11 +5115,9 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5405,16 +5125,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5422,16 +5141,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5446,54 +5164,49 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v7 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v4 -; GFX90A-NEXT: v_mov_b32_e32 v13, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v7 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v4 -; GFX942-NEXT: v_mov_b32_e32 v13, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5508,16 +5221,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v9, 0 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: v_mov_b32_e32 v7, v4 -; GFX900-NEXT: v_mov_b32_e32 v8, v2 -; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5525,16 +5237,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5542,17 +5253,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5567,16 +5277,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5624,39 +5333,40 @@ define void @v_shuffle_v4f32_v4f32__u_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__u_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__u_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__u_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5717,11 +5427,10 @@ define void @v_shuffle_v4f32_v4f32__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5731,11 +5440,11 @@ define void @v_shuffle_v4f32_v4f32__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5745,11 +5454,11 @@ define void @v_shuffle_v4f32_v4f32__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5762,42 +5471,40 @@ define void @v_shuffle_v4f32_v4f32__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5810,14 +5517,14 @@ define void @v_shuffle_v4f32_v4f32__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5827,11 +5534,11 @@ define void @v_shuffle_v4f32_v4f32__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5841,11 +5548,11 @@ define void @v_shuffle_v4f32_v4f32__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5858,39 +5565,40 @@ define void @v_shuffle_v4f32_v4f32__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__4_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__4_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__4_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5904,16 +5612,15 @@ define void @v_shuffle_v4f32_v4f32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5923,14 +5630,14 @@ define void @v_shuffle_v4f32_v4f32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5940,14 +5647,14 @@ define void @v_shuffle_v4f32_v4f32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5962,16 +5669,15 @@ define void @v_shuffle_v4f32_v4f32__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5979,16 +5685,15 @@ define void @v_shuffle_v4f32_v4f32__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5996,16 +5701,15 @@ define void @v_shuffle_v4f32_v4f32__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6019,17 +5723,16 @@ define void @v_shuffle_v4f32_v4f32__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6039,14 +5742,14 @@ define void @v_shuffle_v4f32_v4f32__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6056,14 +5759,14 @@ define void @v_shuffle_v4f32_v4f32__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6077,16 +5780,15 @@ define void @v_shuffle_v4f32_v4f32__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6136,14 +5838,12 @@ define void @v_shuffle_v4f32_v4f32__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6190,15 +5890,14 @@ define void @v_shuffle_v4f32_v4f32__7_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6244,17 +5943,15 @@ define void @v_shuffle_v4f32_v4f32__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6302,16 +5999,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6358,16 +6054,15 @@ define void @v_shuffle_v4f32_v4f32__7_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6416,16 +6111,15 @@ define void @v_shuffle_v4f32_v4f32__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6472,16 +6166,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6489,16 +6182,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6506,16 +6198,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6530,15 +6221,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6546,15 +6236,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6562,16 +6251,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6586,17 +6274,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6606,15 +6292,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6624,15 +6308,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6653,11 +6335,9 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6667,15 +6347,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6685,15 +6364,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6708,15 +6386,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6724,15 +6401,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6740,16 +6416,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6764,16 +6439,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6781,16 +6455,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6798,17 +6471,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6823,16 +6495,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6840,16 +6511,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6857,17 +6527,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6882,16 +6551,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6989,12 +6657,11 @@ define void @v_shuffle_v4f32_v4f32__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__1_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7031,36 +6698,33 @@ define void @v_shuffle_v4f32_v4f32__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__2_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7073,12 +6737,11 @@ define void @v_shuffle_v4f32_v4f32__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7141,13 +6804,12 @@ define void @v_shuffle_v4f32_v4f32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7155,13 +6817,12 @@ define void @v_shuffle_v4f32_v4f32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7178,43 +6839,39 @@ define void @v_shuffle_v4f32_v4f32__6_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7228,15 +6885,14 @@ define void @v_shuffle_v4f32_v4f32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7244,13 +6900,12 @@ define void @v_shuffle_v4f32_v4f32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7258,13 +6913,12 @@ define void @v_shuffle_v4f32_v4f32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7278,43 +6932,39 @@ define void @v_shuffle_v4f32_v4f32__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7329,17 +6979,15 @@ define void @v_shuffle_v4f32_v4f32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7352,11 +7000,10 @@ define void @v_shuffle_v4f32_v4f32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7366,15 +7013,14 @@ define void @v_shuffle_v4f32_v4f32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7389,15 +7035,15 @@ define void @v_shuffle_v4f32_v4f32__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7407,14 +7053,13 @@ define void @v_shuffle_v4f32_v4f32__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7424,15 +7069,14 @@ define void @v_shuffle_v4f32_v4f32__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7447,16 +7091,15 @@ define void @v_shuffle_v4f32_v4f32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7464,15 +7107,14 @@ define void @v_shuffle_v4f32_v4f32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7481,16 +7123,15 @@ define void @v_shuffle_v4f32_v4f32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[6:7] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -7506,17 +7147,15 @@ define void @v_shuffle_v4f32_v4f32__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7526,15 +7165,13 @@ define void @v_shuffle_v4f32_v4f32__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7544,15 +7181,13 @@ define void @v_shuffle_v4f32_v4f32__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7566,46 +7201,42 @@ define void @v_shuffle_v4f32_v4f32__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 ; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7619,14 +7250,14 @@ define void @v_shuffle_v4f32_v4f32__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7634,13 +7265,12 @@ define void @v_shuffle_v4f32_v4f32__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7648,13 +7278,12 @@ define void @v_shuffle_v4f32_v4f32__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7668,46 +7297,42 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7721,14 +7346,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7769,17 +7393,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7787,17 +7409,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7805,17 +7425,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7830,17 +7448,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7848,16 +7464,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7865,17 +7480,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7890,15 +7503,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7906,16 +7519,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7923,17 +7535,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7948,17 +7559,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7966,16 +7575,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7983,17 +7591,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8007,14 +7614,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8054,15 +7661,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v0 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8157,12 +7763,12 @@ define void @v_shuffle_v4f32_v4f32__0_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8216,10 +7822,9 @@ define void @v_shuffle_v4f32_v4f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8275,10 +7880,9 @@ define void @v_shuffle_v4f32_v4f32__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8286,16 +7890,15 @@ define void @v_shuffle_v4f32_v4f32__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8303,16 +7906,15 @@ define void @v_shuffle_v4f32_v4f32__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8330,13 +7932,12 @@ define void @v_shuffle_v4f32_v4f32__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8433,11 +8034,11 @@ define void @v_shuffle_v4f32_v4f32__5_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8482,11 +8083,11 @@ define void @v_shuffle_v4f32_v4f32__6_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8496,11 +8097,11 @@ define void @v_shuffle_v4f32_v4f32__6_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8510,11 +8111,11 @@ define void @v_shuffle_v4f32_v4f32__6_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8528,14 +8129,14 @@ define void @v_shuffle_v4f32_v4f32__7_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8577,14 +8178,13 @@ define void @v_shuffle_v4f32_v4f32__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8592,13 +8192,12 @@ define void @v_shuffle_v4f32_v4f32__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8606,13 +8205,12 @@ define void @v_shuffle_v4f32_v4f32__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8627,17 +8225,15 @@ define void @v_shuffle_v4f32_v4f32__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8685,15 +8281,15 @@ define void @v_shuffle_v4f32_v4f32__7_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8741,17 +8337,15 @@ define void @v_shuffle_v4f32_v4f32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8759,15 +8353,14 @@ define void @v_shuffle_v4f32_v4f32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8776,16 +8369,15 @@ define void @v_shuffle_v4f32_v4f32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[6:7] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -8801,17 +8393,15 @@ define void @v_shuffle_v4f32_v4f32__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8821,15 +8411,13 @@ define void @v_shuffle_v4f32_v4f32__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8839,15 +8427,13 @@ define void @v_shuffle_v4f32_v4f32__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8861,15 +8447,14 @@ define void @v_shuffle_v4f32_v4f32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8877,13 +8462,12 @@ define void @v_shuffle_v4f32_v4f32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8891,13 +8475,12 @@ define void @v_shuffle_v4f32_v4f32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8926,13 +8509,12 @@ define void @v_shuffle_v4f32_v4f32__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8940,13 +8522,12 @@ define void @v_shuffle_v4f32_v4f32__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8960,45 +8541,42 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9012,13 +8590,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9059,16 +8637,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9076,17 +8653,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9094,17 +8669,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9119,16 +8692,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9136,17 +8708,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v1 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9154,17 +8725,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9179,16 +8749,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9196,16 +8765,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9213,17 +8781,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9238,17 +8805,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9256,17 +8821,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9274,17 +8838,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9298,45 +8861,42 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9350,46 +8910,42 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9406,10 +8962,10 @@ define void @v_shuffle_v4f32_v4f32__u_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9514,10 +9070,9 @@ define void @v_shuffle_v4f32_v4f32__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9571,10 +9126,9 @@ define void @v_shuffle_v4f32_v4f32__2_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9582,16 +9136,16 @@ define void @v_shuffle_v4f32_v4f32__2_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9599,16 +9153,16 @@ define void @v_shuffle_v4f32_v4f32__2_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v6 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9626,13 +9180,12 @@ define void @v_shuffle_v4f32_v4f32__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: v_mov_b32_e32 v7, v6 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9728,11 +9281,10 @@ define void @v_shuffle_v4f32_v4f32__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9775,11 +9327,11 @@ define void @v_shuffle_v4f32_v4f32__6_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9789,11 +9341,11 @@ define void @v_shuffle_v4f32_v4f32__6_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9803,11 +9355,11 @@ define void @v_shuffle_v4f32_v4f32__6_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -9821,14 +9373,14 @@ define void @v_shuffle_v4f32_v4f32__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9868,13 +9420,13 @@ define void @v_shuffle_v4f32_v4f32__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9915,16 +9467,15 @@ define void @v_shuffle_v4f32_v4f32__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9972,16 +9523,15 @@ define void @v_shuffle_v4f32_v4f32__7_1_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10031,15 +9581,14 @@ define void @v_shuffle_v4f32_v4f32__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 ; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10088,16 +9637,15 @@ define void @v_shuffle_v4f32_v4f32__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v6 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10105,16 +9653,16 @@ define void @v_shuffle_v4f32_v4f32__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10122,16 +9670,16 @@ define void @v_shuffle_v4f32_v4f32__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v6 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10192,13 +9740,14 @@ define void @v_shuffle_v4f32_v4f32__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10238,14 +9787,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10287,14 +9836,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10337,17 +9885,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v0 -; GFX900-NEXT: v_mov_b32_e32 v8, v3 -; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10355,17 +9901,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10373,17 +9918,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v4 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10398,17 +9943,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v1 -; GFX900-NEXT: v_mov_b32_e32 v9, v4 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10416,16 +9959,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10433,17 +9975,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10458,16 +9998,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10517,17 +10056,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10535,16 +10072,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10552,17 +10088,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10576,46 +10111,42 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10629,15 +10160,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10677,39 +10207,40 @@ define void @v_shuffle_v4f32_v4f32__u_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__u_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__u_7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__u_7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10789,10 +10320,10 @@ define void @v_shuffle_v4f32_v4f32__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10800,16 +10331,16 @@ define void @v_shuffle_v4f32_v4f32__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10817,17 +10348,16 @@ define void @v_shuffle_v4f32_v4f32__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10848,10 +10378,10 @@ define void @v_shuffle_v4f32_v4f32__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10859,16 +10389,16 @@ define void @v_shuffle_v4f32_v4f32__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10876,16 +10406,16 @@ define void @v_shuffle_v4f32_v4f32__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v7 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10903,13 +10433,13 @@ define void @v_shuffle_v4f32_v4f32__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10917,16 +10447,16 @@ define void @v_shuffle_v4f32_v4f32__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10934,16 +10464,17 @@ define void @v_shuffle_v4f32_v4f32__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11006,11 +10537,10 @@ define void @v_shuffle_v4f32_v4f32__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11020,11 +10550,11 @@ define void @v_shuffle_v4f32_v4f32__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11034,11 +10564,11 @@ define void @v_shuffle_v4f32_v4f32__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11052,42 +10582,40 @@ define void @v_shuffle_v4f32_v4f32__6_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__6_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__6_7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11101,13 +10629,13 @@ define void @v_shuffle_v4f32_v4f32__7_u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11148,16 +10676,15 @@ define void @v_shuffle_v4f32_v4f32__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11205,16 +10732,15 @@ define void @v_shuffle_v4f32_v4f32__7_1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11264,15 +10790,14 @@ define void @v_shuffle_v4f32_v4f32__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 ; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -11321,16 +10846,15 @@ define void @v_shuffle_v4f32_v4f32__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11338,16 +10862,16 @@ define void @v_shuffle_v4f32_v4f32__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11355,16 +10879,16 @@ define void @v_shuffle_v4f32_v4f32__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11378,15 +10902,14 @@ define void @v_shuffle_v4f32_v4f32__7_4_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11426,13 +10949,14 @@ define void @v_shuffle_v4f32_v4f32__7_5_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11472,14 +10996,14 @@ define void @v_shuffle_v4f32_v4f32__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11519,13 +11043,13 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11536,9 +11060,9 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11549,9 +11073,9 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11566,16 +11090,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11583,16 +11106,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11600,17 +11122,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11625,16 +11145,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11642,16 +11161,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11659,17 +11178,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11684,16 +11202,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11701,16 +11218,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11718,17 +11234,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11743,16 +11258,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11760,16 +11274,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11777,16 +11291,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11800,46 +11314,42 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11853,46 +11363,42 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11906,13 +11412,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll index 03503c9dac197..ad2dd3a8f8073 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v4i32_v2i32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v4i32_v2i32__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v4i32_v2i32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -207,15 +204,14 @@ define void @v_shuffle_v4i32_v2i32__3_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -262,10 +258,10 @@ define void @v_shuffle_v4i32_v2i32__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -306,12 +302,12 @@ define void @v_shuffle_v4i32_v2i32__3_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -349,15 +345,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -402,14 +398,14 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -457,11 +453,11 @@ define void @v_shuffle_v4i32_v2i32__3_3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -502,13 +498,13 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -548,16 +544,16 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -604,15 +600,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -660,12 +656,12 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -705,14 +701,14 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -850,14 +846,14 @@ define void @v_shuffle_v4i32_v2i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__1_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -945,16 +941,15 @@ define void @v_shuffle_v4i32_v2i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1001,15 +996,14 @@ define void @v_shuffle_v4i32_v2i32__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1056,16 +1050,15 @@ define void @v_shuffle_v4i32_v2i32__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1114,15 +1107,15 @@ define void @v_shuffle_v4i32_v2i32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1169,15 +1162,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1223,16 +1216,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1278,16 +1270,16 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1333,17 +1325,16 @@ define void @v_shuffle_v4i32_v2i32__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1483,14 +1474,14 @@ define void @v_shuffle_v4i32_v2i32__1_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__1_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1577,17 +1568,16 @@ define void @v_shuffle_v4i32_v2i32__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1635,16 +1625,15 @@ define void @v_shuffle_v4i32_v2i32__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_u_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1690,16 +1679,15 @@ define void @v_shuffle_v4i32_v2i32__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_0_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1748,13 +1736,13 @@ define void @v_shuffle_v4i32_v2i32__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1801,15 +1789,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1855,15 +1843,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1907,15 +1895,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1962,14 +1950,13 @@ define void @v_shuffle_v4i32_v2i32__3_3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2067,12 +2054,11 @@ define void @v_shuffle_v4i32_v2i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__1_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2120,14 +2106,14 @@ define void @v_shuffle_v4i32_v2i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2167,13 +2153,13 @@ define void @v_shuffle_v4i32_v2i32__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2214,15 +2200,15 @@ define void @v_shuffle_v4i32_v2i32__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2269,15 +2255,15 @@ define void @v_shuffle_v4i32_v2i32__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2323,14 +2309,14 @@ define void @v_shuffle_v4i32_v2i32__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2372,14 +2358,13 @@ define void @v_shuffle_v4i32_v2i32__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2423,15 +2408,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2481,15 +2466,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2638,16 +2623,15 @@ define void @v_shuffle_v4i32_v2i32__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2742,13 +2726,13 @@ define void @v_shuffle_v4i32_v2i32__3_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2789,15 +2773,15 @@ define void @v_shuffle_v4i32_v2i32__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2844,15 +2828,15 @@ define void @v_shuffle_v4i32_v2i32__3_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2898,13 +2882,14 @@ define void @v_shuffle_v4i32_v2i32__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2944,13 +2929,13 @@ define void @v_shuffle_v4i32_v2i32__3_3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2991,15 +2976,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3046,15 +3031,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3102,13 +3087,14 @@ define void @v_shuffle_v4i32_v2i32__3_3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll index fc6d2a84d4892..2cf0f5c030d74 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v4i32_v3i32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -100,36 +99,33 @@ define void @v_shuffle_v4i32_v3i32__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__2_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -153,12 +149,11 @@ define void @v_shuffle_v4i32_v3i32__4_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__4_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -196,36 +191,33 @@ define void @v_shuffle_v4i32_v3i32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -239,48 +231,45 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_0_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_0_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -294,46 +283,43 @@ define void @v_shuffle_v4i32_v3i32__5_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_1_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_1_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -347,16 +333,14 @@ define void @v_shuffle_v4i32_v3i32__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -364,15 +348,14 @@ define void @v_shuffle_v4i32_v3i32__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -380,15 +363,14 @@ define void @v_shuffle_v4i32_v3i32__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -404,37 +386,35 @@ define void @v_shuffle_v4i32_v3i32__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -448,36 +428,37 @@ define void @v_shuffle_v4i32_v3i32__5_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_4_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -491,39 +472,37 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -537,51 +516,46 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -596,15 +570,14 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -612,16 +585,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -629,17 +601,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -654,15 +624,14 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -670,15 +639,14 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -686,16 +654,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -711,40 +678,38 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -758,42 +723,40 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -807,39 +770,40 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -853,50 +817,51 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v9, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v9, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -911,15 +876,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -927,15 +892,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -943,16 +908,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -967,15 +932,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -983,16 +948,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1000,16 +965,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1025,43 +990,41 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1075,45 +1038,43 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1127,42 +1088,43 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1189,29 +1151,26 @@ define void @v_shuffle_v4i32_v3i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1273,42 +1232,39 @@ define void @v_shuffle_v4i32_v3i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__1_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__1_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1321,45 +1277,43 @@ define void @v_shuffle_v4i32_v3i32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1385,29 +1339,26 @@ define void @v_shuffle_v4i32_v3i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1421,16 +1372,15 @@ define void @v_shuffle_v4i32_v3i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1438,15 +1388,14 @@ define void @v_shuffle_v4i32_v3i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1455,16 +1404,15 @@ define void @v_shuffle_v4i32_v3i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1480,16 +1428,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1497,17 +1444,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1515,17 +1460,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1540,49 +1483,44 @@ define void @v_shuffle_v4i32_v3i32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1597,16 +1535,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1614,17 +1551,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1632,17 +1567,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1657,16 +1590,15 @@ define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1676,15 +1608,13 @@ define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1694,15 +1624,13 @@ define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1717,16 +1645,15 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1734,17 +1661,15 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1752,17 +1677,15 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1777,15 +1700,15 @@ define void @v_shuffle_v4i32_v3i32__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1793,16 +1716,15 @@ define void @v_shuffle_v4i32_v3i32__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1810,16 +1732,15 @@ define void @v_shuffle_v4i32_v3i32__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1834,16 +1755,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1851,17 +1771,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1869,17 +1787,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1893,17 +1809,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1911,16 +1825,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1928,16 +1841,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1951,17 +1863,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1969,16 +1880,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1986,17 +1896,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2010,53 +1918,51 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 ; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v9, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v8 ; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v9, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2070,53 +1976,51 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v9, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v9, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2130,17 +2034,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2148,16 +2051,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2165,17 +2067,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2282,11 +2183,11 @@ define void @v_shuffle_v4i32_v3i32__1_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2327,42 +2228,43 @@ define void @v_shuffle_v4i32_v3i32__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__2_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__2_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2421,16 +2323,15 @@ define void @v_shuffle_v4i32_v3i32__4_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2381,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2497,16 +2397,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2514,17 +2413,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2539,15 +2436,14 @@ define void @v_shuffle_v4i32_v3i32__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2555,16 +2451,14 @@ define void @v_shuffle_v4i32_v3i32__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2572,17 +2466,14 @@ define void @v_shuffle_v4i32_v3i32__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2597,15 +2488,14 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2613,17 +2503,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2631,17 +2519,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2656,16 +2542,15 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2675,15 +2560,13 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2693,15 +2576,13 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2718,14 +2599,13 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2733,17 +2613,15 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,17 +2629,15 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2776,15 +2652,15 @@ define void @v_shuffle_v4i32_v3i32__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2792,15 +2668,15 @@ define void @v_shuffle_v4i32_v3i32__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2808,16 +2684,15 @@ define void @v_shuffle_v4i32_v3i32__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2832,16 +2707,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2849,17 +2723,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2867,17 +2739,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2891,51 +2761,46 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2949,52 +2814,46 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3009,16 +2868,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3028,15 +2886,13 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3046,15 +2902,13 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3069,16 +2923,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3086,17 +2939,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3104,17 +2955,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3129,16 +2979,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3146,17 +2995,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3164,17 +3011,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3188,13 +3034,13 @@ define void @v_shuffle_v4i32_v3i32__u_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__u_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3278,14 +3124,13 @@ define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__1_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3324,42 +3169,43 @@ define void @v_shuffle_v4i32_v3i32__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__2_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__2_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3372,13 +3218,13 @@ define void @v_shuffle_v4i32_v3i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__3_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3418,16 +3264,15 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3474,17 +3319,16 @@ define void @v_shuffle_v4i32_v3i32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3492,16 +3336,15 @@ define void @v_shuffle_v4i32_v3i32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3509,16 +3352,15 @@ define void @v_shuffle_v4i32_v3i32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3532,48 +3374,46 @@ define void @v_shuffle_v4i32_v3i32__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3588,15 +3428,14 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3606,15 +3445,14 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3624,15 +3462,14 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3646,48 +3483,46 @@ define void @v_shuffle_v4i32_v3i32__5_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_1_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_1_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3704,14 +3539,13 @@ define void @v_shuffle_v4i32_v3i32__5_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3719,16 +3553,15 @@ define void @v_shuffle_v4i32_v3i32__5_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3736,16 +3569,15 @@ define void @v_shuffle_v4i32_v3i32__5_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3760,15 +3592,15 @@ define void @v_shuffle_v4i32_v3i32__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3778,14 +3610,13 @@ define void @v_shuffle_v4i32_v3i32__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3795,14 +3626,13 @@ define void @v_shuffle_v4i32_v3i32__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3817,16 +3647,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3834,16 +3663,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3851,16 +3679,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3875,15 +3702,14 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3891,16 +3717,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3908,16 +3733,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3932,16 +3756,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[5:7] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3951,15 +3774,13 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3969,15 +3790,13 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3992,15 +3811,14 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4008,16 +3826,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4025,16 +3842,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4049,53 +3866,49 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v6 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4110,16 +3923,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4127,16 +3939,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4144,17 +3955,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4218,12 +4028,11 @@ define void @v_shuffle_v4i32_v3i32__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__1_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4260,36 +4069,33 @@ define void @v_shuffle_v4i32_v3i32__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4314,42 +4120,39 @@ define void @v_shuffle_v4i32_v3i32__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__4_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__4_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4363,45 +4166,43 @@ define void @v_shuffle_v4i32_v3i32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4415,14 +4216,13 @@ define void @v_shuffle_v4i32_v3i32__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4464,15 +4264,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4480,17 +4280,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4498,17 +4297,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4523,15 +4321,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4539,16 +4337,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4556,17 +4353,16 @@ define void @v_shuffle_v4i32_v3i32__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4581,51 +4377,49 @@ define void @v_shuffle_v4i32_v3i32__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4641,43 +4435,41 @@ define void @v_shuffle_v4i32_v3i32__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4693,43 +4485,41 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4743,43 +4533,40 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4794,15 +4581,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4810,17 +4597,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4828,17 +4613,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4853,15 +4637,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4869,16 +4653,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4886,17 +4669,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4911,15 +4693,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4927,16 +4709,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4944,17 +4725,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4970,40 +4750,39 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5120,16 +4899,15 @@ define void @v_shuffle_v4i32_v3i32__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5179,16 +4957,15 @@ define void @v_shuffle_v4i32_v3i32__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5196,16 +4973,15 @@ define void @v_shuffle_v4i32_v3i32__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5213,17 +4989,15 @@ define void @v_shuffle_v4i32_v3i32__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5286,11 +5060,11 @@ define void @v_shuffle_v4i32_v3i32__4_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5332,42 +5106,43 @@ define void @v_shuffle_v4i32_v3i32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5381,13 +5156,13 @@ define void @v_shuffle_v4i32_v3i32__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5429,15 +5204,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5445,17 +5220,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5463,17 +5237,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5488,15 +5261,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5504,15 +5277,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5520,16 +5293,16 @@ define void @v_shuffle_v4i32_v3i32__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5544,51 +5317,49 @@ define void @v_shuffle_v4i32_v3i32__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5602,44 +5373,43 @@ define void @v_shuffle_v4i32_v3i32__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5653,45 +5423,43 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5705,43 +5473,40 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5756,16 +5521,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5773,17 +5537,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5791,17 +5553,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5816,16 +5577,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5833,17 +5593,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5851,17 +5610,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5876,16 +5634,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5893,16 +5650,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5910,17 +5666,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5934,44 +5689,43 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5985,13 +5739,13 @@ define void @v_shuffle_v4i32_v3i32__u_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__u_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6090,16 +5844,15 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6147,16 +5900,15 @@ define void @v_shuffle_v4i32_v3i32__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6164,16 +5916,16 @@ define void @v_shuffle_v4i32_v3i32__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6181,17 +5933,16 @@ define void @v_shuffle_v4i32_v3i32__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6251,14 +6002,13 @@ define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__4_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6298,39 +6048,40 @@ define void @v_shuffle_v4i32_v3i32__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6345,16 +6096,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6362,16 +6112,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6379,16 +6129,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6403,16 +6153,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6420,16 +6169,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6437,17 +6185,16 @@ define void @v_shuffle_v4i32_v3i32__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6464,14 +6211,13 @@ define void @v_shuffle_v4i32_v3i32__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6481,14 +6227,14 @@ define void @v_shuffle_v4i32_v3i32__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6498,15 +6244,14 @@ define void @v_shuffle_v4i32_v3i32__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6520,44 +6265,43 @@ define void @v_shuffle_v4i32_v3i32__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6571,39 +6315,43 @@ define void @v_shuffle_v4i32_v3i32__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_4_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6617,42 +6365,40 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6667,16 +6413,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6684,17 +6429,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6702,17 +6445,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6727,16 +6469,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6746,14 +6487,13 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6763,15 +6503,14 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6786,16 +6525,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6803,16 +6541,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6820,17 +6557,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6844,45 +6580,43 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6896,42 +6630,41 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll index ee2f94b90ffa9..c7d7bf9fa1623 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v4i32_v4i32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -100,36 +99,33 @@ define void @v_shuffle_v4i32_v4i32__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__2_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -142,12 +138,11 @@ define void @v_shuffle_v4i32_v4i32__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -195,12 +190,11 @@ define void @v_shuffle_v4i32_v4i32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,36 +232,33 @@ define void @v_shuffle_v4i32_v4i32__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__6_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -281,12 +272,11 @@ define void @v_shuffle_v4i32_v4i32__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -324,16 +314,14 @@ define void @v_shuffle_v4i32_v4i32__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +366,14 @@ define void @v_shuffle_v4i32_v4i32__7_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -431,16 +418,14 @@ define void @v_shuffle_v4i32_v4i32__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -485,16 +470,14 @@ define void @v_shuffle_v4i32_v4i32__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -508,9 +491,8 @@ define void @v_shuffle_v4i32_v4i32__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -524,9 +506,9 @@ define void @v_shuffle_v4i32_v4i32__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -540,13 +522,12 @@ define void @v_shuffle_v4i32_v4i32__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -584,12 +565,12 @@ define void @v_shuffle_v4i32_v4i32__7_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -627,13 +608,12 @@ define void @v_shuffle_v4i32_v4i32__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -671,13 +651,12 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -688,9 +667,8 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -701,9 +679,8 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -717,17 +694,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -737,14 +712,12 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -754,15 +727,12 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -777,16 +747,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -794,16 +762,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -811,17 +778,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -836,15 +801,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -852,15 +816,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -868,16 +831,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -892,16 +854,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -909,16 +869,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -926,16 +885,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -949,43 +907,39 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -999,42 +953,39 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1048,13 +999,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1094,14 +1045,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1111,11 +1061,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1125,11 +1074,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1143,18 +1091,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1162,16 +1108,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1179,17 +1124,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1204,17 +1147,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,17 +1163,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1240,18 +1179,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1266,17 +1202,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1284,16 +1218,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1301,17 +1234,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1326,16 +1258,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1343,16 +1274,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1360,17 +1290,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1384,14 +1313,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1431,46 +1360,42 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1484,15 +1409,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1532,14 +1456,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1549,11 +1473,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1563,11 +1487,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1582,13 +1506,12 @@ define void @v_shuffle_v4i32_v4i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1596,13 +1519,12 @@ define void @v_shuffle_v4i32_v4i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1610,13 +1532,12 @@ define void @v_shuffle_v4i32_v4i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1692,13 +1613,12 @@ define void @v_shuffle_v4i32_v4i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1706,13 +1626,12 @@ define void @v_shuffle_v4i32_v4i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1728,43 +1647,39 @@ define void @v_shuffle_v4i32_v4i32__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1777,15 +1692,14 @@ define void @v_shuffle_v4i32_v4i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1793,13 +1707,12 @@ define void @v_shuffle_v4i32_v4i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1807,13 +1720,12 @@ define void @v_shuffle_v4i32_v4i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1827,13 +1739,12 @@ define void @v_shuffle_v4i32_v4i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1841,13 +1752,12 @@ define void @v_shuffle_v4i32_v4i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1855,13 +1765,12 @@ define void @v_shuffle_v4i32_v4i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1877,15 +1786,13 @@ define void @v_shuffle_v4i32_v4i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1893,16 +1800,15 @@ define void @v_shuffle_v4i32_v4i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1910,17 +1816,16 @@ define void @v_shuffle_v4i32_v4i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1935,17 +1840,15 @@ define void @v_shuffle_v4i32_v4i32__6_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1953,17 +1856,15 @@ define void @v_shuffle_v4i32_v4i32__6_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1971,18 +1872,15 @@ define void @v_shuffle_v4i32_v4i32__6_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1997,17 +1895,15 @@ define void @v_shuffle_v4i32_v4i32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2015,16 +1911,15 @@ define void @v_shuffle_v4i32_v4i32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2032,17 +1927,16 @@ define void @v_shuffle_v4i32_v4i32__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2057,16 +1951,14 @@ define void @v_shuffle_v4i32_v4i32__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2074,16 +1966,15 @@ define void @v_shuffle_v4i32_v4i32__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2091,17 +1982,15 @@ define void @v_shuffle_v4i32_v4i32__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2116,17 +2005,15 @@ define void @v_shuffle_v4i32_v4i32__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2134,17 +2021,16 @@ define void @v_shuffle_v4i32_v4i32__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2152,18 +2038,16 @@ define void @v_shuffle_v4i32_v4i32__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2178,16 +2062,15 @@ define void @v_shuffle_v4i32_v4i32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v7, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2195,16 +2078,15 @@ define void @v_shuffle_v4i32_v4i32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2212,16 +2094,15 @@ define void @v_shuffle_v4i32_v4i32__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2236,17 +2117,15 @@ define void @v_shuffle_v4i32_v4i32__7_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2295,17 +2174,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2313,16 +2190,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2330,17 +2206,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2355,16 +2229,15 @@ define void @v_shuffle_v4i32_v4i32__7_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2414,17 +2287,15 @@ define void @v_shuffle_v4i32_v4i32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2432,16 +2303,15 @@ define void @v_shuffle_v4i32_v4i32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2449,17 +2319,15 @@ define void @v_shuffle_v4i32_v4i32__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2474,17 +2342,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2492,17 +2358,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2510,18 +2374,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2535,17 +2396,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2553,16 +2412,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2570,17 +2428,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2594,18 +2450,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2613,16 +2467,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2630,17 +2483,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2654,18 +2505,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v8 +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2673,17 +2522,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2691,17 +2539,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2715,17 +2562,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2733,16 +2579,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2750,16 +2595,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2773,18 +2617,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2794,15 +2636,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2812,15 +2653,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2834,17 +2674,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2852,16 +2691,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2869,17 +2707,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2893,17 +2729,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3045,11 +2880,11 @@ define void @v_shuffle_v4i32_v4i32__1_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3093,11 +2928,11 @@ define void @v_shuffle_v4i32_v4i32__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3107,11 +2942,11 @@ define void @v_shuffle_v4i32_v4i32__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3121,11 +2956,11 @@ define void @v_shuffle_v4i32_v4i32__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3138,14 +2973,14 @@ define void @v_shuffle_v4i32_v4i32__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3232,16 +3067,15 @@ define void @v_shuffle_v4i32_v4i32__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3291,16 +3125,15 @@ define void @v_shuffle_v4i32_v4i32__6_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3308,16 +3141,15 @@ define void @v_shuffle_v4i32_v4i32__6_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3325,17 +3157,15 @@ define void @v_shuffle_v4i32_v4i32__6_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3350,16 +3180,15 @@ define void @v_shuffle_v4i32_v4i32__7_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3409,16 +3238,14 @@ define void @v_shuffle_v4i32_v4i32__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3426,16 +3253,15 @@ define void @v_shuffle_v4i32_v4i32__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3443,17 +3269,15 @@ define void @v_shuffle_v4i32_v4i32__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3468,17 +3292,14 @@ define void @v_shuffle_v4i32_v4i32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3486,16 +3307,15 @@ define void @v_shuffle_v4i32_v4i32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3503,17 +3323,16 @@ define void @v_shuffle_v4i32_v4i32__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3528,17 +3347,15 @@ define void @v_shuffle_v4i32_v4i32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3546,16 +3363,15 @@ define void @v_shuffle_v4i32_v4i32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3563,16 +3379,15 @@ define void @v_shuffle_v4i32_v4i32__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3587,16 +3402,15 @@ define void @v_shuffle_v4i32_v4i32__7_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3645,17 +3459,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3663,16 +3475,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3680,17 +3491,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3705,16 +3514,15 @@ define void @v_shuffle_v4i32_v4i32__7_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3764,17 +3572,15 @@ define void @v_shuffle_v4i32_v4i32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3782,16 +3588,15 @@ define void @v_shuffle_v4i32_v4i32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3799,17 +3604,15 @@ define void @v_shuffle_v4i32_v4i32__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3824,17 +3627,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3842,17 +3643,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3860,18 +3659,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3885,17 +3681,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3905,14 +3699,12 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3922,15 +3714,12 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3944,18 +3733,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3965,15 +3751,12 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3983,16 +3766,12 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4007,17 +3786,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4027,15 +3804,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4045,15 +3820,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4068,17 +3841,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v8 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4088,15 +3859,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4106,15 +3875,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4129,17 +3896,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4147,17 +3912,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4165,17 +3928,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4190,16 +3951,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4207,17 +3967,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v3 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4225,17 +3983,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4250,16 +4006,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4311,10 +4066,10 @@ define void @v_shuffle_v4i32_v4i32__u_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4401,11 +4156,10 @@ define void @v_shuffle_v4i32_v4i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4447,11 +4201,11 @@ define void @v_shuffle_v4i32_v4i32__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4461,11 +4215,11 @@ define void @v_shuffle_v4i32_v4i32__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4475,11 +4229,11 @@ define void @v_shuffle_v4i32_v4i32__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4492,14 +4246,14 @@ define void @v_shuffle_v4i32_v4i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4541,10 +4295,10 @@ define void @v_shuffle_v4i32_v4i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4584,16 +4338,15 @@ define void @v_shuffle_v4i32_v4i32__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4641,16 +4394,15 @@ define void @v_shuffle_v4i32_v4i32__6_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4658,16 +4410,15 @@ define void @v_shuffle_v4i32_v4i32__6_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4675,16 +4426,15 @@ define void @v_shuffle_v4i32_v4i32__6_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4699,16 +4449,15 @@ define void @v_shuffle_v4i32_v4i32__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4756,19 +4505,18 @@ define void @v_shuffle_v4i32_v4i32__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_2_2: +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -4811,17 +4559,14 @@ define void @v_shuffle_v4i32_v4i32__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4868,15 +4613,14 @@ define void @v_shuffle_v4i32_v4i32__7_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4923,16 +4667,15 @@ define void @v_shuffle_v4i32_v4i32__7_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4981,16 +4724,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5037,15 +4779,14 @@ define void @v_shuffle_v4i32_v4i32__7_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -5095,16 +4836,15 @@ define void @v_shuffle_v4i32_v4i32__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5151,16 +4891,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5168,16 +4907,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5185,16 +4923,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5209,16 +4946,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5226,16 +4961,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5243,16 +4977,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5267,17 +5000,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5287,15 +5018,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5305,15 +5034,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5328,17 +5055,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5346,16 +5070,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5363,16 +5086,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[6:7] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5393,11 +5115,9 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5405,16 +5125,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5422,16 +5141,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5446,54 +5164,49 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v7 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v4 -; GFX90A-NEXT: v_mov_b32_e32 v13, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v7 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v4 -; GFX942-NEXT: v_mov_b32_e32 v13, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5508,16 +5221,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v9, 0 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: v_mov_b32_e32 v7, v4 -; GFX900-NEXT: v_mov_b32_e32 v8, v2 -; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5525,16 +5237,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5542,17 +5253,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5567,16 +5277,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5624,39 +5333,40 @@ define void @v_shuffle_v4i32_v4i32__u_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__u_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__u_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__u_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5717,11 +5427,10 @@ define void @v_shuffle_v4i32_v4i32__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5731,11 +5440,11 @@ define void @v_shuffle_v4i32_v4i32__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5745,11 +5454,11 @@ define void @v_shuffle_v4i32_v4i32__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5762,42 +5471,40 @@ define void @v_shuffle_v4i32_v4i32__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5810,14 +5517,14 @@ define void @v_shuffle_v4i32_v4i32__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5827,11 +5534,11 @@ define void @v_shuffle_v4i32_v4i32__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5841,11 +5548,11 @@ define void @v_shuffle_v4i32_v4i32__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5858,39 +5565,40 @@ define void @v_shuffle_v4i32_v4i32__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__4_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__4_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__4_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5904,16 +5612,15 @@ define void @v_shuffle_v4i32_v4i32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5923,14 +5630,14 @@ define void @v_shuffle_v4i32_v4i32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5940,14 +5647,14 @@ define void @v_shuffle_v4i32_v4i32__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5962,16 +5669,15 @@ define void @v_shuffle_v4i32_v4i32__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5979,16 +5685,15 @@ define void @v_shuffle_v4i32_v4i32__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5996,16 +5701,15 @@ define void @v_shuffle_v4i32_v4i32__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6019,17 +5723,16 @@ define void @v_shuffle_v4i32_v4i32__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6039,14 +5742,14 @@ define void @v_shuffle_v4i32_v4i32__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6056,14 +5759,14 @@ define void @v_shuffle_v4i32_v4i32__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6077,16 +5780,15 @@ define void @v_shuffle_v4i32_v4i32__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6136,14 +5838,12 @@ define void @v_shuffle_v4i32_v4i32__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6190,15 +5890,14 @@ define void @v_shuffle_v4i32_v4i32__7_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6244,17 +5943,15 @@ define void @v_shuffle_v4i32_v4i32__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6302,16 +5999,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6358,16 +6054,15 @@ define void @v_shuffle_v4i32_v4i32__7_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6416,16 +6111,15 @@ define void @v_shuffle_v4i32_v4i32__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6472,16 +6166,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6489,16 +6182,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6506,16 +6198,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6530,15 +6221,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6546,15 +6236,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6562,16 +6251,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6586,17 +6274,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6606,15 +6292,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6624,15 +6308,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6653,11 +6335,9 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6667,15 +6347,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6685,15 +6364,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6708,15 +6386,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6724,15 +6401,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6740,16 +6416,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6764,16 +6439,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6781,16 +6455,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6798,17 +6471,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6823,16 +6495,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6840,16 +6511,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6857,17 +6527,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6882,16 +6551,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6989,12 +6657,11 @@ define void @v_shuffle_v4i32_v4i32__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__1_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7031,36 +6698,33 @@ define void @v_shuffle_v4i32_v4i32__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__2_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7073,12 +6737,11 @@ define void @v_shuffle_v4i32_v4i32__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7141,13 +6804,12 @@ define void @v_shuffle_v4i32_v4i32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7155,13 +6817,12 @@ define void @v_shuffle_v4i32_v4i32__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7178,43 +6839,39 @@ define void @v_shuffle_v4i32_v4i32__6_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7228,15 +6885,14 @@ define void @v_shuffle_v4i32_v4i32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7244,13 +6900,12 @@ define void @v_shuffle_v4i32_v4i32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7258,13 +6913,12 @@ define void @v_shuffle_v4i32_v4i32__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7278,43 +6932,39 @@ define void @v_shuffle_v4i32_v4i32__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7329,17 +6979,15 @@ define void @v_shuffle_v4i32_v4i32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7352,11 +7000,10 @@ define void @v_shuffle_v4i32_v4i32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7366,15 +7013,14 @@ define void @v_shuffle_v4i32_v4i32__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7389,15 +7035,15 @@ define void @v_shuffle_v4i32_v4i32__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7407,14 +7053,13 @@ define void @v_shuffle_v4i32_v4i32__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7424,15 +7069,14 @@ define void @v_shuffle_v4i32_v4i32__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7447,16 +7091,15 @@ define void @v_shuffle_v4i32_v4i32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7464,15 +7107,14 @@ define void @v_shuffle_v4i32_v4i32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7481,16 +7123,15 @@ define void @v_shuffle_v4i32_v4i32__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[6:7] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -7506,17 +7147,15 @@ define void @v_shuffle_v4i32_v4i32__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7526,15 +7165,13 @@ define void @v_shuffle_v4i32_v4i32__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7544,15 +7181,13 @@ define void @v_shuffle_v4i32_v4i32__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7566,46 +7201,42 @@ define void @v_shuffle_v4i32_v4i32__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 ; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7619,14 +7250,14 @@ define void @v_shuffle_v4i32_v4i32__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7634,13 +7265,12 @@ define void @v_shuffle_v4i32_v4i32__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7648,13 +7278,12 @@ define void @v_shuffle_v4i32_v4i32__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7668,46 +7297,42 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7721,14 +7346,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7769,17 +7393,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7787,17 +7409,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7805,17 +7425,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7830,17 +7448,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7848,16 +7464,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7865,17 +7480,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7890,15 +7503,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7906,16 +7519,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7923,17 +7535,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7948,17 +7559,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7966,16 +7575,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7983,17 +7591,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8007,14 +7614,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8054,15 +7661,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v0 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8157,12 +7763,12 @@ define void @v_shuffle_v4i32_v4i32__0_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8216,10 +7822,9 @@ define void @v_shuffle_v4i32_v4i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8275,10 +7880,9 @@ define void @v_shuffle_v4i32_v4i32__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8286,16 +7890,15 @@ define void @v_shuffle_v4i32_v4i32__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8303,16 +7906,15 @@ define void @v_shuffle_v4i32_v4i32__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8330,13 +7932,12 @@ define void @v_shuffle_v4i32_v4i32__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8433,11 +8034,11 @@ define void @v_shuffle_v4i32_v4i32__5_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8482,11 +8083,11 @@ define void @v_shuffle_v4i32_v4i32__6_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8496,11 +8097,11 @@ define void @v_shuffle_v4i32_v4i32__6_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8510,11 +8111,11 @@ define void @v_shuffle_v4i32_v4i32__6_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8528,14 +8129,14 @@ define void @v_shuffle_v4i32_v4i32__7_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8577,14 +8178,13 @@ define void @v_shuffle_v4i32_v4i32__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8592,13 +8192,12 @@ define void @v_shuffle_v4i32_v4i32__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8606,13 +8205,12 @@ define void @v_shuffle_v4i32_v4i32__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8627,17 +8225,15 @@ define void @v_shuffle_v4i32_v4i32__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8685,15 +8281,15 @@ define void @v_shuffle_v4i32_v4i32__7_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8741,17 +8337,15 @@ define void @v_shuffle_v4i32_v4i32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8759,15 +8353,14 @@ define void @v_shuffle_v4i32_v4i32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8776,16 +8369,15 @@ define void @v_shuffle_v4i32_v4i32__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[6:7] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -8801,17 +8393,15 @@ define void @v_shuffle_v4i32_v4i32__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8821,15 +8411,13 @@ define void @v_shuffle_v4i32_v4i32__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8839,15 +8427,13 @@ define void @v_shuffle_v4i32_v4i32__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8861,15 +8447,14 @@ define void @v_shuffle_v4i32_v4i32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8877,13 +8462,12 @@ define void @v_shuffle_v4i32_v4i32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8891,13 +8475,12 @@ define void @v_shuffle_v4i32_v4i32__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8926,13 +8509,12 @@ define void @v_shuffle_v4i32_v4i32__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8940,13 +8522,12 @@ define void @v_shuffle_v4i32_v4i32__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8960,45 +8541,42 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9012,13 +8590,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9059,16 +8637,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9076,17 +8653,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9094,17 +8669,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9119,16 +8692,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9136,17 +8708,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v1 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9154,17 +8725,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9179,16 +8749,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9196,16 +8765,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9213,17 +8781,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9238,17 +8805,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9256,17 +8821,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9274,17 +8838,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9298,45 +8861,42 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9350,46 +8910,42 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9406,10 +8962,10 @@ define void @v_shuffle_v4i32_v4i32__u_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9514,10 +9070,9 @@ define void @v_shuffle_v4i32_v4i32__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9571,10 +9126,9 @@ define void @v_shuffle_v4i32_v4i32__2_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9582,16 +9136,16 @@ define void @v_shuffle_v4i32_v4i32__2_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9599,16 +9153,16 @@ define void @v_shuffle_v4i32_v4i32__2_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v6 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9626,13 +9180,12 @@ define void @v_shuffle_v4i32_v4i32__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: v_mov_b32_e32 v7, v6 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9728,11 +9281,10 @@ define void @v_shuffle_v4i32_v4i32__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9775,11 +9327,11 @@ define void @v_shuffle_v4i32_v4i32__6_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9789,11 +9341,11 @@ define void @v_shuffle_v4i32_v4i32__6_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9803,11 +9355,11 @@ define void @v_shuffle_v4i32_v4i32__6_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -9821,14 +9373,14 @@ define void @v_shuffle_v4i32_v4i32__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9868,13 +9420,13 @@ define void @v_shuffle_v4i32_v4i32__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9915,16 +9467,15 @@ define void @v_shuffle_v4i32_v4i32__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9972,16 +9523,15 @@ define void @v_shuffle_v4i32_v4i32__7_1_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10031,15 +9581,14 @@ define void @v_shuffle_v4i32_v4i32__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 ; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10088,16 +9637,15 @@ define void @v_shuffle_v4i32_v4i32__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v6 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10105,16 +9653,16 @@ define void @v_shuffle_v4i32_v4i32__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10122,16 +9670,16 @@ define void @v_shuffle_v4i32_v4i32__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v6 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10192,13 +9740,14 @@ define void @v_shuffle_v4i32_v4i32__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10238,14 +9787,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10287,14 +9836,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10337,17 +9885,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v0 -; GFX900-NEXT: v_mov_b32_e32 v8, v3 -; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10355,17 +9901,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10373,17 +9918,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v4 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10398,17 +9943,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v1 -; GFX900-NEXT: v_mov_b32_e32 v9, v4 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10416,16 +9959,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10433,17 +9975,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10458,16 +9998,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10517,17 +10056,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10535,16 +10072,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10552,17 +10088,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10576,46 +10111,42 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10629,15 +10160,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10677,39 +10207,40 @@ define void @v_shuffle_v4i32_v4i32__u_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__u_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__u_7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__u_7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10789,10 +10320,10 @@ define void @v_shuffle_v4i32_v4i32__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10800,16 +10331,16 @@ define void @v_shuffle_v4i32_v4i32__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10817,17 +10348,16 @@ define void @v_shuffle_v4i32_v4i32__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10848,10 +10378,10 @@ define void @v_shuffle_v4i32_v4i32__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10859,16 +10389,16 @@ define void @v_shuffle_v4i32_v4i32__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10876,16 +10406,16 @@ define void @v_shuffle_v4i32_v4i32__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v7 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10903,13 +10433,13 @@ define void @v_shuffle_v4i32_v4i32__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10917,16 +10447,16 @@ define void @v_shuffle_v4i32_v4i32__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10934,16 +10464,17 @@ define void @v_shuffle_v4i32_v4i32__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11006,11 +10537,10 @@ define void @v_shuffle_v4i32_v4i32__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11020,11 +10550,11 @@ define void @v_shuffle_v4i32_v4i32__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11034,11 +10564,11 @@ define void @v_shuffle_v4i32_v4i32__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11052,42 +10582,40 @@ define void @v_shuffle_v4i32_v4i32__6_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__6_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__6_7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11101,13 +10629,13 @@ define void @v_shuffle_v4i32_v4i32__7_u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11148,16 +10676,15 @@ define void @v_shuffle_v4i32_v4i32__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11205,16 +10732,15 @@ define void @v_shuffle_v4i32_v4i32__7_1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11264,15 +10790,14 @@ define void @v_shuffle_v4i32_v4i32__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 ; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -11321,16 +10846,15 @@ define void @v_shuffle_v4i32_v4i32__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11338,16 +10862,16 @@ define void @v_shuffle_v4i32_v4i32__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11355,16 +10879,16 @@ define void @v_shuffle_v4i32_v4i32__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11378,15 +10902,14 @@ define void @v_shuffle_v4i32_v4i32__7_4_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11426,13 +10949,14 @@ define void @v_shuffle_v4i32_v4i32__7_5_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11472,14 +10996,14 @@ define void @v_shuffle_v4i32_v4i32__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11519,13 +11043,13 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11536,9 +11060,9 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11549,9 +11073,9 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11566,16 +11090,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11583,16 +11106,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11600,17 +11122,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11625,16 +11145,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11642,16 +11161,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11659,17 +11178,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11684,16 +11202,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11701,16 +11218,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11718,17 +11234,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11743,16 +11258,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11760,16 +11274,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11777,16 +11291,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11800,46 +11314,42 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11853,46 +11363,42 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11906,13 +11412,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll index 21ec9acf6317d..79dde44bcbdec 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll @@ -58,39 +58,33 @@ define void @v_shuffle_v4i64_v2i64__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -114,39 +108,33 @@ define void @v_shuffle_v4i64_v2i64__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -160,55 +148,42 @@ define void @v_shuffle_v4i64_v2i64__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_0_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_0_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -222,49 +197,43 @@ define void @v_shuffle_v4i64_v2i64__3_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_1_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_1_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -291,31 +260,27 @@ define void @v_shuffle_v4i64_v2i64__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -329,39 +294,40 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -375,54 +341,54 @@ define void @v_shuffle_v4i64_v2i64__3_3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -436,57 +402,51 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -500,45 +460,42 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -552,44 +509,47 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() %vec1 = call <2 x i64> asm "; def $0", "=v"() %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> @@ -601,63 +561,54 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -671,57 +622,55 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -735,54 +684,52 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -796,42 +743,43 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -846,13 +794,13 @@ define void @v_shuffle_v4i64_v2i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -860,13 +808,13 @@ define void @v_shuffle_v4i64_v2i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -874,13 +822,13 @@ define void @v_shuffle_v4i64_v2i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -943,53 +891,47 @@ define void @v_shuffle_v4i64_v2i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1003,13 +945,13 @@ define void @v_shuffle_v4i64_v2i64__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1017,13 +959,13 @@ define void @v_shuffle_v4i64_v2i64__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1031,13 +973,13 @@ define void @v_shuffle_v4i64_v2i64__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1051,21 +993,16 @@ define void @v_shuffle_v4i64_v2i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1073,21 +1010,16 @@ define void @v_shuffle_v4i64_v2i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1095,22 +1027,16 @@ define void @v_shuffle_v4i64_v2i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1125,19 +1051,16 @@ define void @v_shuffle_v4i64_v2i64__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,19 +1068,16 @@ define void @v_shuffle_v4i64_v2i64__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1165,20 +1085,16 @@ define void @v_shuffle_v4i64_v2i64__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1192,63 +1108,58 @@ define void @v_shuffle_v4i64_v2i64__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v12, v0 -; GFX90A-NEXT: v_mov_b32_e32 v13, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v12, v0 -; GFX942-NEXT: v_mov_b32_e32 v13, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1263,21 +1174,19 @@ define void @v_shuffle_v4i64_v2i64__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1285,21 +1194,19 @@ define void @v_shuffle_v4i64_v2i64__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1307,21 +1214,19 @@ define void @v_shuffle_v4i64_v2i64__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1336,19 +1241,19 @@ define void @v_shuffle_v4i64_v2i64__3_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1356,19 +1261,19 @@ define void @v_shuffle_v4i64_v2i64__3_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,20 +1281,19 @@ define void @v_shuffle_v4i64_v2i64__3_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1404,19 +1308,16 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1424,19 +1325,16 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1444,20 +1342,16 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1474,17 +1368,16 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1494,17 +1387,16 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1514,18 +1406,16 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1540,20 +1430,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1561,20 +1449,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1582,20 +1468,19 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1609,42 +1494,43 @@ define void @v_shuffle_v4i64_v2i64__u_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__u_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__u_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__u_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1657,49 +1543,43 @@ define void @v_shuffle_v4i64_v2i64__0_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1712,42 +1592,43 @@ define void @v_shuffle_v4i64_v2i64__1_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__1_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1760,42 +1641,43 @@ define void @v_shuffle_v4i64_v2i64__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__2_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__2_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__2_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1809,19 +1691,16 @@ define void @v_shuffle_v4i64_v2i64__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1829,19 +1708,16 @@ define void @v_shuffle_v4i64_v2i64__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1849,19 +1725,16 @@ define void @v_shuffle_v4i64_v2i64__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1876,19 +1749,16 @@ define void @v_shuffle_v4i64_v2i64__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1896,19 +1766,16 @@ define void @v_shuffle_v4i64_v2i64__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1916,19 +1783,17 @@ define void @v_shuffle_v4i64_v2i64__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1942,69 +1807,52 @@ define void @v_shuffle_v4i64_v2i64__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_0_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v2 -; GFX900-NEXT: v_mov_b32_e32 v9, v3 -; GFX900-NEXT: v_mov_b32_e32 v10, v2 -; GFX900-NEXT: v_mov_b32_e32 v11, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_0_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: v_mov_b32_e32 v12, v2 -; GFX90A-NEXT: v_mov_b32_e32 v13, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_0_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: v_mov_b32_e32 v12, v2 -; GFX942-NEXT: v_mov_b32_e32 v13, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2018,66 +1866,58 @@ define void @v_shuffle_v4i64_v2i64__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_2_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2092,18 +1932,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2111,18 +1951,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2130,18 +1970,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2155,51 +1995,55 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2213,51 +2057,54 @@ define void @v_shuffle_v4i64_v2i64__3_3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2272,18 +2119,19 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2291,18 +2139,19 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2310,18 +2159,20 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2385,39 +2236,33 @@ define void @v_shuffle_v4i64_v2i64__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__1_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2443,53 +2288,47 @@ define void @v_shuffle_v4i64_v2i64__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2503,54 +2342,43 @@ define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_u_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2564,70 +2392,52 @@ define void @v_shuffle_v4i64_v2i64__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_0_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: v_mov_b32_e32 v8, v2 -; GFX900-NEXT: v_mov_b32_e32 v9, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_0_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_0_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2642,20 +2452,16 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v10, v4 -; GFX900-NEXT: v_mov_b32_e32 v11, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2663,20 +2469,16 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: v_mov_b32_e32 v12, v4 -; GFX90A-NEXT: v_mov_b32_e32 v13, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2684,21 +2486,17 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: v_mov_b32_e32 v12, v4 -; GFX942-NEXT: v_mov_b32_e32 v13, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2712,54 +2510,49 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2773,48 +2566,42 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2831,15 +2618,14 @@ define void @v_shuffle_v4i64_v2i64__3_3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2849,15 +2635,14 @@ define void @v_shuffle_v4i64_v2i64__3_3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2867,15 +2652,14 @@ define void @v_shuffle_v4i64_v2i64__3_3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2889,63 +2673,51 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2959,42 +2731,43 @@ define void @v_shuffle_v4i64_v2i64__u_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__u_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__u_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__u_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3009,16 +2782,18 @@ define void @v_shuffle_v4i64_v2i64__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3026,16 +2801,18 @@ define void @v_shuffle_v4i64_v2i64__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3043,17 +2820,19 @@ define void @v_shuffle_v4i64_v2i64__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3070,17 +2849,16 @@ define void @v_shuffle_v4i64_v2i64__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3090,17 +2868,16 @@ define void @v_shuffle_v4i64_v2i64__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3110,17 +2887,16 @@ define void @v_shuffle_v4i64_v2i64__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3134,49 +2910,43 @@ define void @v_shuffle_v4i64_v2i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3190,42 +2960,43 @@ define void @v_shuffle_v4i64_v2i64__3_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3239,61 +3010,61 @@ define void @v_shuffle_v4i64_v2i64__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_0_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_0_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_0_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3308,18 +3079,19 @@ define void @v_shuffle_v4i64_v2i64__3_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3327,18 +3099,19 @@ define void @v_shuffle_v4i64_v2i64__3_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3346,18 +3119,20 @@ define void @v_shuffle_v4i64_v2i64__3_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3371,58 +3146,52 @@ define void @v_shuffle_v4i64_v2i64__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3436,45 +3205,42 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3489,16 +3255,19 @@ define void @v_shuffle_v4i64_v2i64__3_3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3506,16 +3275,19 @@ define void @v_shuffle_v4i64_v2i64__3_3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3523,17 +3295,20 @@ define void @v_shuffle_v4i64_v2i64__3_3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3550,17 +3325,16 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3570,17 +3344,16 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3590,18 +3363,16 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3615,45 +3386,42 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll index 615b382aa355a..97a9a0f94944d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll @@ -100,39 +100,33 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__2_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__2_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -196,39 +190,33 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -242,55 +230,42 @@ define void @v_shuffle_v4i64_v3i64__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_0_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_0_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -304,49 +279,43 @@ define void @v_shuffle_v4i64_v3i64__5_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_1_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_1_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -360,49 +329,43 @@ define void @v_shuffle_v4i64_v3i64__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -416,45 +379,40 @@ define void @v_shuffle_v4i64_v3i64__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -468,39 +426,40 @@ define void @v_shuffle_v4i64_v3i64__5_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_4_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_4_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -514,39 +473,40 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -560,51 +520,54 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -618,51 +581,54 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -676,57 +642,51 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -740,45 +700,42 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -792,45 +749,42 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -844,51 +798,45 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -902,67 +850,54 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -976,58 +911,55 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1041,57 +973,55 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_5_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1105,54 +1035,52 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1166,51 +1094,52 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1224,42 +1153,43 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1274,13 +1204,13 @@ define void @v_shuffle_v4i64_v3i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1288,13 +1218,13 @@ define void @v_shuffle_v4i64_v3i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1302,13 +1232,13 @@ define void @v_shuffle_v4i64_v3i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1370,15 +1300,15 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1386,17 +1316,15 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1404,17 +1332,15 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1430,16 +1356,13 @@ define void @v_shuffle_v4i64_v3i64__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1449,16 +1372,13 @@ define void @v_shuffle_v4i64_v3i64__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1468,16 +1388,13 @@ define void @v_shuffle_v4i64_v3i64__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1491,13 +1408,13 @@ define void @v_shuffle_v4i64_v3i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1505,13 +1422,13 @@ define void @v_shuffle_v4i64_v3i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1519,13 +1436,13 @@ define void @v_shuffle_v4i64_v3i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1604,20 +1521,16 @@ define void @v_shuffle_v4i64_v3i64__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1625,20 +1538,16 @@ define void @v_shuffle_v4i64_v3i64__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1646,21 +1555,16 @@ define void @v_shuffle_v4i64_v3i64__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1675,19 +1579,16 @@ define void @v_shuffle_v4i64_v3i64__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1695,19 +1596,16 @@ define void @v_shuffle_v4i64_v3i64__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1715,20 +1613,16 @@ define void @v_shuffle_v4i64_v3i64__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1742,64 +1636,58 @@ define void @v_shuffle_v4i64_v3i64__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1814,19 +1702,19 @@ define void @v_shuffle_v4i64_v3i64__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,19 +1722,19 @@ define void @v_shuffle_v4i64_v3i64__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1854,19 +1742,19 @@ define void @v_shuffle_v4i64_v3i64__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1881,21 +1769,19 @@ define void @v_shuffle_v4i64_v3i64__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1903,21 +1789,19 @@ define void @v_shuffle_v4i64_v3i64__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1925,21 +1809,19 @@ define void @v_shuffle_v4i64_v3i64__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1954,19 +1836,19 @@ define void @v_shuffle_v4i64_v3i64__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1974,19 +1856,19 @@ define void @v_shuffle_v4i64_v3i64__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1994,20 +1876,19 @@ define void @v_shuffle_v4i64_v3i64__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2022,18 +1903,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2041,18 +1923,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2060,19 +1943,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2087,18 +1970,16 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2106,18 +1987,16 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2125,19 +2004,16 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2152,18 +2028,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2171,18 +2047,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2190,19 +2066,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2217,20 +2092,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2238,20 +2111,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2259,20 +2130,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2287,19 +2156,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2307,19 +2175,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2327,20 +2194,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2355,21 +2220,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2377,21 +2239,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2399,22 +2258,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2575,14 +2431,13 @@ define void @v_shuffle_v4i64_v3i64__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2592,14 +2447,13 @@ define void @v_shuffle_v4i64_v3i64__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2609,14 +2463,13 @@ define void @v_shuffle_v4i64_v3i64__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2743,18 +2596,16 @@ define void @v_shuffle_v4i64_v3i64__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2762,18 +2613,16 @@ define void @v_shuffle_v4i64_v3i64__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2781,19 +2630,16 @@ define void @v_shuffle_v4i64_v3i64__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2808,18 +2654,16 @@ define void @v_shuffle_v4i64_v3i64__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2827,18 +2671,16 @@ define void @v_shuffle_v4i64_v3i64__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2846,19 +2688,16 @@ define void @v_shuffle_v4i64_v3i64__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2873,21 +2712,16 @@ define void @v_shuffle_v4i64_v3i64__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2895,21 +2729,16 @@ define void @v_shuffle_v4i64_v3i64__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2917,22 +2746,16 @@ define void @v_shuffle_v4i64_v3i64__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2947,19 +2770,19 @@ define void @v_shuffle_v4i64_v3i64__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2967,19 +2790,19 @@ define void @v_shuffle_v4i64_v3i64__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2987,19 +2810,19 @@ define void @v_shuffle_v4i64_v3i64__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3014,20 +2837,19 @@ define void @v_shuffle_v4i64_v3i64__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3035,20 +2857,19 @@ define void @v_shuffle_v4i64_v3i64__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3056,20 +2877,19 @@ define void @v_shuffle_v4i64_v3i64__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3084,18 +2904,18 @@ define void @v_shuffle_v4i64_v3i64__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 ; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3104,18 +2924,18 @@ define void @v_shuffle_v4i64_v3i64__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 ; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3124,19 +2944,18 @@ define void @v_shuffle_v4i64_v3i64__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 ; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3152,18 +2971,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3171,18 +2991,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3190,19 +3011,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3216,51 +3037,55 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3274,51 +3099,54 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3333,18 +3161,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3352,18 +3180,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3371,18 +3199,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3397,39 +3225,37 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_3_1: +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3437,20 +3263,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3465,18 +3290,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3484,18 +3310,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3503,19 +3330,20 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3529,42 +3357,43 @@ define void @v_shuffle_v4i64_v3i64__u_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__u_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__u_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__u_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3580,11 +3409,13 @@ define void @v_shuffle_v4i64_v3i64__0_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3594,11 +3425,13 @@ define void @v_shuffle_v4i64_v3i64__0_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3608,11 +3441,13 @@ define void @v_shuffle_v4i64_v3i64__0_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3625,49 +3460,43 @@ define void @v_shuffle_v4i64_v3i64__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3680,42 +3509,43 @@ define void @v_shuffle_v4i64_v3i64__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__2_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__2_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__2_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3728,42 +3558,43 @@ define void @v_shuffle_v4i64_v3i64__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__3_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__3_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__3_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3777,17 +3608,17 @@ define void @v_shuffle_v4i64_v3i64__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: v_mov_b32_e32 v10, v4 ; GFX900-NEXT: v_mov_b32_e32 v11, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3796,17 +3627,17 @@ define void @v_shuffle_v4i64_v3i64__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3815,17 +3646,18 @@ define void @v_shuffle_v4i64_v3i64__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3841,19 +3673,16 @@ define void @v_shuffle_v4i64_v3i64__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3861,19 +3690,16 @@ define void @v_shuffle_v4i64_v3i64__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3881,19 +3707,16 @@ define void @v_shuffle_v4i64_v3i64__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3908,18 +3731,16 @@ define void @v_shuffle_v4i64_v3i64__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3927,18 +3748,16 @@ define void @v_shuffle_v4i64_v3i64__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3946,18 +3765,17 @@ define void @v_shuffle_v4i64_v3i64__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -3971,66 +3789,52 @@ define void @v_shuffle_v4i64_v3i64__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_0_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_0_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_0_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4044,63 +3848,52 @@ define void @v_shuffle_v4i64_v3i64__5_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_1_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_1_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4114,63 +3907,58 @@ define void @v_shuffle_v4i64_v3i64__5_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v14, v8 +; GFX900-NEXT: v_mov_b32_e32 v15, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v14, v8 +; GFX90A-NEXT: v_mov_b32_e32 v15, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v14, v8 +; GFX942-NEXT: v_mov_b32_e32 v15, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4185,18 +3973,18 @@ define void @v_shuffle_v4i64_v3i64__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v6, v10 -; GFX900-NEXT: v_mov_b32_e32 v7, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v12, v8 +; GFX900-NEXT: v_mov_b32_e32 v13, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4204,18 +3992,18 @@ define void @v_shuffle_v4i64_v3i64__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v10 -; GFX90A-NEXT: v_mov_b32_e32 v7, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v8 +; GFX90A-NEXT: v_mov_b32_e32 v13, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4223,18 +4011,18 @@ define void @v_shuffle_v4i64_v3i64__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v10 -; GFX942-NEXT: v_mov_b32_e32 v7, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v8 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v13, v9 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4249,18 +4037,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4268,18 +4056,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4287,18 +4075,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4312,51 +4100,55 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4371,18 +4163,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: v_mov_b32_e32 v8, v10 ; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4390,18 +4183,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v10 ; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4409,18 +4203,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v10 ; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4434,51 +4229,54 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4493,19 +4291,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4513,19 +4310,18 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4533,20 +4329,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4561,18 +4356,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4580,18 +4376,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4599,18 +4396,20 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4716,39 +4515,33 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4773,15 +4566,15 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4789,17 +4582,15 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4807,17 +4598,15 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4834,16 +4623,13 @@ define void @v_shuffle_v4i64_v3i64__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4853,16 +4639,13 @@ define void @v_shuffle_v4i64_v3i64__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4872,16 +4655,13 @@ define void @v_shuffle_v4i64_v3i64__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4902,10 +4682,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4919,10 +4696,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4936,10 +4710,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4956,19 +4727,14 @@ define void @v_shuffle_v4i64_v3i64__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4978,19 +4744,14 @@ define void @v_shuffle_v4i64_v3i64__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5000,20 +4761,14 @@ define void @v_shuffle_v4i64_v3i64__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5028,18 +4783,16 @@ define void @v_shuffle_v4i64_v3i64__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5047,18 +4800,16 @@ define void @v_shuffle_v4i64_v3i64__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5066,19 +4817,17 @@ define void @v_shuffle_v4i64_v3i64__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5093,18 +4842,16 @@ define void @v_shuffle_v4i64_v3i64__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5112,18 +4859,16 @@ define void @v_shuffle_v4i64_v3i64__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5131,18 +4876,17 @@ define void @v_shuffle_v4i64_v3i64__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5156,54 +4900,49 @@ define void @v_shuffle_v4i64_v3i64__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_4_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5220,14 +4959,13 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5237,14 +4975,13 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5254,14 +4991,13 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5275,51 +5011,42 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5336,14 +5063,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5353,14 +5080,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5370,14 +5097,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5394,14 +5121,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5411,14 +5138,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5428,14 +5155,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5449,63 +5176,51 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5519,54 +5234,49 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_4_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5763,15 +5473,15 @@ define void @v_shuffle_v4i64_v3i64__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v8 -; GFX900-NEXT: v_mov_b32_e32 v11, v9 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5782,15 +5492,15 @@ define void @v_shuffle_v4i64_v3i64__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v8 -; GFX90A-NEXT: v_mov_b32_e32 v11, v9 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5801,15 +5511,15 @@ define void @v_shuffle_v4i64_v3i64__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v8 -; GFX942-NEXT: v_mov_b32_e32 v11, v9 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5924,14 +5634,13 @@ define void @v_shuffle_v4i64_v3i64__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5941,14 +5650,13 @@ define void @v_shuffle_v4i64_v3i64__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5958,14 +5666,13 @@ define void @v_shuffle_v4i64_v3i64__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5986,10 +5693,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6003,10 +5707,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6020,10 +5721,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6040,19 +5738,14 @@ define void @v_shuffle_v4i64_v3i64__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6062,19 +5755,14 @@ define void @v_shuffle_v4i64_v3i64__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6084,20 +5772,14 @@ define void @v_shuffle_v4i64_v3i64__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6112,18 +5794,16 @@ define void @v_shuffle_v4i64_v3i64__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6131,18 +5811,16 @@ define void @v_shuffle_v4i64_v3i64__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6150,19 +5828,17 @@ define void @v_shuffle_v4i64_v3i64__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6177,18 +5853,16 @@ define void @v_shuffle_v4i64_v3i64__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6196,18 +5870,16 @@ define void @v_shuffle_v4i64_v3i64__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6215,18 +5887,17 @@ define void @v_shuffle_v4i64_v3i64__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6240,60 +5911,56 @@ define void @v_shuffle_v4i64_v3i64__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_3_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v2 ; GFX900-NEXT: v_mov_b32_e32 v7, v3 ; GFX900-NEXT: v_mov_b32_e32 v8, v2 ; GFX900-NEXT: v_mov_b32_e32 v9, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 ; GFX90A-NEXT: v_mov_b32_e32 v10, v2 ; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, v2 +; GFX90A-NEXT: v_mov_b32_e32 v13, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v2 ; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-NEXT: v_mov_b32_e32 v13, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6310,14 +5977,13 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6327,14 +5993,13 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6344,14 +6009,13 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6365,45 +6029,42 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6418,18 +6079,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6437,18 +6099,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6456,19 +6119,20 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6483,18 +6147,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6502,18 +6167,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6521,19 +6187,20 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6550,17 +6217,16 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6570,17 +6236,16 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6590,18 +6255,16 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6615,45 +6278,42 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6667,42 +6327,43 @@ define void @v_shuffle_v4i64_v3i64__u_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__u_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__u_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__u_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6717,18 +6378,18 @@ define void @v_shuffle_v4i64_v3i64__0_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[8:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6736,18 +6397,18 @@ define void @v_shuffle_v4i64_v3i64__0_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[8:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6755,19 +6416,19 @@ define void @v_shuffle_v4i64_v3i64__0_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6782,18 +6443,18 @@ define void @v_shuffle_v4i64_v3i64__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6801,18 +6462,18 @@ define void @v_shuffle_v4i64_v3i64__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6820,19 +6481,19 @@ define void @v_shuffle_v4i64_v3i64__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6849,17 +6510,16 @@ define void @v_shuffle_v4i64_v3i64__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6869,17 +6529,16 @@ define void @v_shuffle_v4i64_v3i64__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6889,17 +6548,16 @@ define void @v_shuffle_v4i64_v3i64__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6916,11 +6574,13 @@ define void @v_shuffle_v4i64_v3i64__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6930,11 +6590,13 @@ define void @v_shuffle_v4i64_v3i64__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6944,11 +6606,13 @@ define void @v_shuffle_v4i64_v3i64__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -6962,49 +6626,43 @@ define void @v_shuffle_v4i64_v3i64__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7018,48 +6676,43 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_u_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_u_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7073,64 +6726,61 @@ define void @v_shuffle_v4i64_v3i64__5_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_0_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_0_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_0_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7145,18 +6795,19 @@ define void @v_shuffle_v4i64_v3i64__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v8 ; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7164,18 +6815,19 @@ define void @v_shuffle_v4i64_v3i64__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v8 ; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7183,19 +6835,20 @@ define void @v_shuffle_v4i64_v3i64__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v6, v8 ; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7210,18 +6863,19 @@ define void @v_shuffle_v4i64_v3i64__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7229,18 +6883,19 @@ define void @v_shuffle_v4i64_v3i64__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7248,18 +6903,20 @@ define void @v_shuffle_v4i64_v3i64__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7273,51 +6930,52 @@ define void @v_shuffle_v4i64_v3i64__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_3_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7331,54 +6989,52 @@ define void @v_shuffle_v4i64_v3i64__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_4_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_4_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7392,45 +7048,42 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7445,18 +7098,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7464,18 +7118,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7483,19 +7138,20 @@ define void @v_shuffle_v4i64_v3i64__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7510,18 +7166,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7529,18 +7186,19 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7548,19 +7206,20 @@ define void @v_shuffle_v4i64_v3i64__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7577,17 +7236,16 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7597,17 +7255,16 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7617,18 +7274,16 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7645,11 +7300,13 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7659,11 +7316,13 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7673,11 +7332,13 @@ define void @v_shuffle_v4i64_v3i64__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7691,45 +7352,42 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7865,10 +7523,9 @@ define void @s_shuffle_v4i64_v3i64__2_u_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -7966,10 +7623,9 @@ define void @s_shuffle_v4i64_v3i64__5_u_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -7986,15 +7642,13 @@ define void @s_shuffle_v4i64_v3i64__5_0_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8004,15 +7658,13 @@ define void @s_shuffle_v4i64_v3i64__5_0_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8075,11 +7727,11 @@ define void @s_shuffle_v4i64_v3i64__5_1_u_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -8130,13 +7782,11 @@ define void @s_shuffle_v4i64_v3i64__5_2_u_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -8149,46 +7799,18 @@ define void @s_shuffle_v4i64_v3i64__5_2_u_u() { } define void @s_shuffle_v4i64_v3i64__5_3_u_u() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_u_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_u_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_3_u_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> @@ -8201,10 +7823,10 @@ define void @s_shuffle_v4i64_v3i64__5_4_u_u() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -8217,50 +7839,18 @@ define void @s_shuffle_v4i64_v3i64__5_4_u_u() { } define void @s_shuffle_v4i64_v3i64__5_5_u_u() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> @@ -8269,65 +7859,21 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_u() { } define void @s_shuffle_v4i64_v3i64__5_5_0_u() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_0_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_0_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> @@ -8340,17 +7886,15 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8360,17 +7904,15 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8380,16 +7922,14 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -8410,12 +7950,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_u() { ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8428,12 +7966,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_u() { ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8445,13 +7981,12 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -8464,52 +7999,20 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_u() { } define void @s_shuffle_v4i64_v3i64__5_5_3_u() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_3_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> @@ -8518,56 +8021,20 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_u() { } define void @s_shuffle_v4i64_v3i64__5_5_4_u() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_4_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> @@ -8580,12 +8047,12 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_u() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -8602,17 +8069,17 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8622,17 +8089,17 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8642,16 +8109,16 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -8665,65 +8132,23 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_0() { } define void @s_shuffle_v4i64_v3i64__5_5_5_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_5_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_5_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> @@ -8732,65 +8157,25 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_1() { } define void @s_shuffle_v4i64_v3i64__5_5_5_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_5_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_5_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> @@ -8799,62 +8184,22 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_2() { } define void @s_shuffle_v4i64_v3i64__5_5_5_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s20 -; GFX900-NEXT: s_mov_b32 s13, s21 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s20 -; GFX90A-NEXT: s_mov_b32 s13, s21 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_5_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_5_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> @@ -8863,58 +8208,22 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_3() { } define void @s_shuffle_v4i64_v3i64__5_5_5_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_5_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_5_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> @@ -8927,14 +8236,14 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_5() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -8947,56 +8256,20 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_5() { } define void @s_shuffle_v4i64_v3i64__u_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) @@ -9027,172 +8300,427 @@ define void @s_shuffle_v4i64_v3i64__0_0_0_0() { } define void @s_shuffle_v4i64_v3i64__1_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_0_0_0: +; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__2_0_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__3_0_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_0_0_0: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_0_0_0: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__4_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__2_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_0_0_0: +define void @s_shuffle_v4i64_v3i64__5_0_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_u_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_1_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_2_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_3_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_3_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_4_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_u_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_0_0_0: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__2_0_0_0: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__3_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0: +define void @s_shuffle_v4i64_v3i64__5_5_1_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART @@ -9200,72 +8728,94 @@ define void @s_shuffle_v4i64_v3i64__3_0_0_0() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__4_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_0_0_0: +define void @s_shuffle_v4i64_v3i64__5_5_2_0() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[16:21] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s20 +; GFX9-NEXT: s_mov_b32 s13, s21 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_3_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_0_0_0: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__4_0_0_0: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -9273,70 +8823,66 @@ define void @s_shuffle_v4i64_v3i64__4_0_0_0() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_0_0: +define void @s_shuffle_v4i64_v3i64__5_5_4_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_0_0: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_0_0_0: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -9344,3422 +8890,640 @@ define void @s_shuffle_v4i64_v3i64__5_0_0_0() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_u_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_0_0: +define void @s_shuffle_v4i64_v3i64__u_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__0_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__1_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__2_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__3_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_0_0: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_u_0_0: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_1_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_0_0: +define void @s_shuffle_v4i64_v3i64__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_0_0: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_1_0_0: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s12, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_2_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__5_u_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_u_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_3_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_3_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__5_0_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_0_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_4_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_4_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__5_2_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_2_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__5_3_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_3_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_u_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__5_4_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_1_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_1_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__5_5_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_2_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s20 -; GFX900-NEXT: s_mov_b32 s13, s21 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s20 -; GFX90A-NEXT: s_mov_b32 s13, s21 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_2_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__5_5_u_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_u_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_3_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_3_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__5_5_0_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_0_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_4_0() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_4_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__5_5_2_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s16 +; GFX9-NEXT: s_mov_b32 s13, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__u_1_1_1() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1: +define void @s_shuffle_v4i64_v3i64__5_5_3_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_3_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[12:17] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__0_1_1_1() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1: +define void @s_shuffle_v4i64_v3i64__5_5_4_1() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_4_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[12:17] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__1_1_1_1() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1: +define void @s_shuffle_v4i64_v3i64__u_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__2_1_1_1() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1: +define void @s_shuffle_v4i64_v3i64__0_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__3_1_1_1() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1: +define void @s_shuffle_v4i64_v3i64__1_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__4_1_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_1_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_1_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_u_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_u_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_0_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_0_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_2_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_2_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_3_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_3_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_4_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_4_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_u_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_0_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_0_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_2_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_2_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_3_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_4_1() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_4_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__u_2_2_2() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__0_2_2_2() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__1_2_2_2() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__2_2_2_2() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__3_2_2_2() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__4_2_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_2_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_2_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__4_2_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_2_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_2_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_u_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_u_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_0_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_0_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_1_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_1_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_3_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_4_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_4_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_u_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_0_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[20:25] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s24 -; GFX900-NEXT: s_mov_b32 s9, s25 -; GFX900-NEXT: s_mov_b32 s10, s24 -; GFX900-NEXT: s_mov_b32 s11, s25 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[20:25] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s24 -; GFX90A-NEXT: s_mov_b32 s9, s25 -; GFX90A-NEXT: s_mov_b32 s10, s24 -; GFX90A-NEXT: s_mov_b32 s11, s25 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_0_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_1_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_3_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[20:25] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s24 -; GFX900-NEXT: s_mov_b32 s9, s25 -; GFX900-NEXT: s_mov_b32 s10, s24 -; GFX900-NEXT: s_mov_b32 s11, s25 -; GFX900-NEXT: s_mov_b32 s12, s20 -; GFX900-NEXT: s_mov_b32 s13, s21 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[20:25] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s24 -; GFX90A-NEXT: s_mov_b32 s9, s25 -; GFX90A-NEXT: s_mov_b32 s10, s24 -; GFX90A-NEXT: s_mov_b32 s11, s25 -; GFX90A-NEXT: s_mov_b32 s12, s20 -; GFX90A-NEXT: s_mov_b32 s13, s21 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_4_2() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[20:25] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s24 -; GFX900-NEXT: s_mov_b32 s9, s25 -; GFX900-NEXT: s_mov_b32 s10, s24 -; GFX900-NEXT: s_mov_b32 s11, s25 -; GFX900-NEXT: s_mov_b32 s12, s22 -; GFX900-NEXT: s_mov_b32 s13, s23 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[20:25] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s24 -; GFX90A-NEXT: s_mov_b32 s9, s25 -; GFX90A-NEXT: s_mov_b32 s10, s24 -; GFX90A-NEXT: s_mov_b32 s11, s25 -; GFX90A-NEXT: s_mov_b32 s12, s22 -; GFX90A-NEXT: s_mov_b32 s13, s23 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_4_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__u_3_3_3() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__0_3_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_3_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_3_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__0_3_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__1_3_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__2_3_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__3_3_3_3() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__4_3_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_3_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_3_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__4_3_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_3_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_3_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_u_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_u_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_0_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_0_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_1_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_1_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_2_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_2_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:21] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s20 -; GFX942-NEXT: s_mov_b32 s9, s21 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s16 -; GFX942-NEXT: s_mov_b32 s13, s17 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__2_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_4_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_4_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__3_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_3: +define void @s_shuffle_v4i64_v3i64__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__5_5_u_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: s_mov_b32 s14, s12 ; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART @@ -12767,16 +9531,19 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_3: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: s_mov_b32 s14, s12 ; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART @@ -12784,1637 +9551,1983 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_3: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__4_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_0_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_3: +define void @s_shuffle_v4i64_v3i64__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_3: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_0_3: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:21] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s20 -; GFX942-NEXT: s_mov_b32 s9, s21 -; GFX942-NEXT: s_mov_b32 s10, s20 -; GFX942-NEXT: s_mov_b32 s11, s21 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_1_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_3: +define void @s_shuffle_v4i64_v3i64__5_u_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_3: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_1_3: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_u_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:21] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s20 -; GFX942-NEXT: s_mov_b32 s9, s21 -; GFX942-NEXT: s_mov_b32 s10, s20 -; GFX942-NEXT: s_mov_b32 s11, s21 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_2_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_3: +define void @s_shuffle_v4i64_v3i64__5_0_2_2() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_0_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s16 +; GFX9-NEXT: s_mov_b32 s13, s17 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_1_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_3: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_2_3: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_1_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_4_3() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_3: +define void @s_shuffle_v4i64_v3i64__5_3_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_3: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_4_3: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__u_4_4_4() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__0_4_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_4_4_4: +define void @s_shuffle_v4i64_v3i64__5_4_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_4_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__0_4_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_4_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__1_4_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_4_4_4: +define void @s_shuffle_v4i64_v3i64__5_5_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_4_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_4_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__2_4_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_4_4_4: +define void @s_shuffle_v4i64_v3i64__5_5_u_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_4_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__2_4_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__3_4_4_4() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4: +define void @s_shuffle_v4i64_v3i64__5_5_0_2() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_0_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[12:17] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__4_4_4_4() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4: +define void @s_shuffle_v4i64_v3i64__5_5_1_2() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_3_2() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_4_2() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_4_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[12:17] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_4_4_4() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4: +define void @s_shuffle_v4i64_v3i64__u_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_3_3_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_u_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_4_4: +define void @s_shuffle_v4i64_v3i64__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_u_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__0_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_0_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_4_4: +define void @s_shuffle_v4i64_v3i64__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_0_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() - %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_1_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_4_4: +define void @s_shuffle_v4i64_v3i64__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_1_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__3_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__4_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__4_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_2_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_4_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_4_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_2_4_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__5_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_3_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_4_4: +define void @s_shuffle_v4i64_v3i64__5_u_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_0_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_3_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_0_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 ; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_4: +define void @s_shuffle_v4i64_v3i64__5_1_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_1_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_u_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_4: +define void @s_shuffle_v4i64_v3i64__5_2_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_0_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_4: +define void @s_shuffle_v4i64_v3i64__5_4_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_u_3() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_0_3() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_0_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_1_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_0_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_1_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_1_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_4: +define void @s_shuffle_v4i64_v3i64__5_5_2_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_1_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_2_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_4: +define void @s_shuffle_v4i64_v3i64__5_5_4_3() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_4_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__u_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_2_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__0_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_3_4() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_4: +define void @s_shuffle_v4i64_v3i64__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_4: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_3_4: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__u_5_5_5() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5: +define void @s_shuffle_v4i64_v3i64__2_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__3_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__0_5_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_5_5_5: +define void @s_shuffle_v4i64_v3i64__4_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_u_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_0_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_5_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__0_5_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_0_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__1_5_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_5_5_5: +define void @s_shuffle_v4i64_v3i64__5_1_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_5_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_5_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_1_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__2_5_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_5_5_5: +define void @s_shuffle_v4i64_v3i64__5_2_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s10, s12 ; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_5_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s10, s12 ; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__2_5_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__3_5_5_5() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5: +define void @s_shuffle_v4i64_v3i64__5_3_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_3_4_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__4_5_5_5() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5: +define void @s_shuffle_v4i64_v3i64__5_5_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_4_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_u_5_5() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5: +define void @s_shuffle_v4i64_v3i64__5_5_u_4() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_u_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_0_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_5_5: +define void @s_shuffle_v4i64_v3i64__5_5_0_4() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_0_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_1_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_1_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_2_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_0_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_1_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_5_5: +define void @s_shuffle_v4i64_v3i64__5_5_3_4() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_3_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__u_5_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -14423,8 +11536,8 @@ define void @s_shuffle_v4i64_v3i64__5_1_5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s12, s16 ; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: s_mov_b32 s14, s16 @@ -14434,7 +11547,7 @@ define void @s_shuffle_v4i64_v3i64__5_1_5_5() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -14443,8 +11556,8 @@ define void @s_shuffle_v4i64_v3i64__5_1_5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s12, s16 ; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: s_mov_b32 s14, s16 @@ -14454,7 +11567,7 @@ define void @s_shuffle_v4i64_v3i64__5_1_5_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_1_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__0_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -14463,8 +11576,8 @@ define void @s_shuffle_v4i64_v3i64__5_1_5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: s_mov_b32 s12, s4 ; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: s_mov_b32 s14, s4 @@ -14475,25 +11588,25 @@ define void @s_shuffle_v4i64_v3i64__5_1_5_5() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_2_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_5_5: +define void @s_shuffle_v4i64_v3i64__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: s_mov_b32 s14, s12 ; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART @@ -14501,19 +11614,19 @@ define void @s_shuffle_v4i64_v3i64__5_2_5_5() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: s_mov_b32 s14, s12 ; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART @@ -14521,7 +11634,7 @@ define void @s_shuffle_v4i64_v3i64__5_2_5_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_2_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -14530,10 +11643,10 @@ define void @s_shuffle_v4i64_v3i64__5_2_5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART @@ -14542,22 +11655,23 @@ define void @s_shuffle_v4i64_v3i64__5_2_5_5() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_3_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_5_5: +define void @s_shuffle_v4i64_v3i64__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s12, s16 ; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: s_mov_b32 s14, s16 @@ -14567,16 +11681,17 @@ define void @s_shuffle_v4i64_v3i64__5_3_5_5() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s12, s16 ; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: s_mov_b32 s14, s16 @@ -14586,16 +11701,18 @@ define void @s_shuffle_v4i64_v3i64__5_3_5_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_3_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__2_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: s_mov_b32 s12, s4 ; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: s_mov_b32 s14, s4 @@ -14606,20 +11723,20 @@ define void @s_shuffle_v4i64_v3i64__5_3_5_5() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_4_5_5() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_5_5: +define void @s_shuffle_v4i64_v3i64__3_5_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 ; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART @@ -14628,399 +11745,518 @@ define void @s_shuffle_v4i64_v3i64__5_4_5_5() { ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_u_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v3i64__4_5_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_0_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_5: +define void @s_shuffle_v4i64_v3i64__5_u_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_0_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_5: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_0_5: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_0_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 +; GFX942-NEXT: s_mov_b32 s14, s8 +; GFX942-NEXT: s_mov_b32 s15, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_1_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_5: +define void @s_shuffle_v4i64_v3i64__5_1_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_5: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_1_5: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_1_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 +; GFX942-NEXT: s_mov_b32 s14, s8 +; GFX942-NEXT: s_mov_b32 s15, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_2_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_5: +define void @s_shuffle_v4i64_v3i64__5_2_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s20 -; GFX900-NEXT: s_mov_b32 s15, s21 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_5: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s20 -; GFX90A-NEXT: s_mov_b32 s15, s21 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_2_5: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_2_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 +; GFX942-NEXT: s_mov_b32 s14, s8 +; GFX942-NEXT: s_mov_b32 s15, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_3_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_5: +define void @s_shuffle_v4i64_v3i64__5_3_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_3_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_4_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_u_5() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_0_5() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_0_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_1_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_5: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_3_5: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_1_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s14, s8 +; GFX942-NEXT: s_mov_b32 s15, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v3i64__5_5_4_5() { -; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_5: +define void @s_shuffle_v4i64_v3i64__5_5_2_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_5: +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_4_5: +; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s14, s8 +; GFX942-NEXT: s_mov_b32 s15, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_3_5() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_3_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_4_5() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_4_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll index 32f6e00716e37..519c90672016d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll @@ -139,39 +139,33 @@ define void @v_shuffle_v4i64_v4i64__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__3_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__3_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__3_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -275,39 +269,33 @@ define void @v_shuffle_v4i64_v4i64__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -321,55 +309,42 @@ define void @v_shuffle_v4i64_v4i64__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_0_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_0_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -383,49 +358,43 @@ define void @v_shuffle_v4i64_v4i64__7_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_1_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_1_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -439,49 +408,43 @@ define void @v_shuffle_v4i64_v4i64__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -495,49 +458,43 @@ define void @v_shuffle_v4i64_v4i64__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -551,45 +508,40 @@ define void @v_shuffle_v4i64_v4i64__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_4_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_4_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -603,39 +555,40 @@ define void @v_shuffle_v4i64_v4i64__7_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_5_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -649,39 +602,40 @@ define void @v_shuffle_v4i64_v4i64__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_6_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_6_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_6_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -695,39 +649,40 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -741,51 +696,54 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -799,51 +757,54 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -857,51 +818,54 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -915,57 +879,52 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -979,42 +938,42 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1028,45 +987,42 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1080,45 +1036,42 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_6_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_6_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_6_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1132,48 +1085,45 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_7_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1187,64 +1137,54 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v6 ; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_7_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1258,58 +1198,55 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_7_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1323,58 +1260,55 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_7_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1388,57 +1322,55 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_7_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1452,57 +1384,52 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1516,48 +1443,52 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1571,51 +1502,52 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_7_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1629,42 +1561,43 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1679,13 +1612,13 @@ define void @v_shuffle_v4i64_v4i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1693,13 +1626,13 @@ define void @v_shuffle_v4i64_v4i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1707,13 +1640,13 @@ define void @v_shuffle_v4i64_v4i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1775,15 +1708,15 @@ define void @v_shuffle_v4i64_v4i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1791,15 +1724,15 @@ define void @v_shuffle_v4i64_v4i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1807,15 +1740,15 @@ define void @v_shuffle_v4i64_v4i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1885,15 +1818,13 @@ define void @v_shuffle_v4i64_v4i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1903,15 +1834,13 @@ define void @v_shuffle_v4i64_v4i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1921,15 +1850,13 @@ define void @v_shuffle_v4i64_v4i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1943,13 +1870,13 @@ define void @v_shuffle_v4i64_v4i64__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1957,13 +1884,13 @@ define void @v_shuffle_v4i64_v4i64__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1971,13 +1898,13 @@ define void @v_shuffle_v4i64_v4i64__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2121,20 +2048,16 @@ define void @v_shuffle_v4i64_v4i64__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2142,20 +2065,16 @@ define void @v_shuffle_v4i64_v4i64__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2163,21 +2082,16 @@ define void @v_shuffle_v4i64_v4i64__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2192,19 +2106,16 @@ define void @v_shuffle_v4i64_v4i64__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2212,19 +2123,16 @@ define void @v_shuffle_v4i64_v4i64__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,20 +2140,16 @@ define void @v_shuffle_v4i64_v4i64__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2260,64 +2164,58 @@ define void @v_shuffle_v4i64_v4i64__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v14, v4 +; GFX90A-NEXT: v_mov_b32_e32 v15, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v14, v4 +; GFX942-NEXT: v_mov_b32_e32 v15, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2332,19 +2230,19 @@ define void @v_shuffle_v4i64_v4i64__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2352,19 +2250,19 @@ define void @v_shuffle_v4i64_v4i64__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2372,19 +2270,19 @@ define void @v_shuffle_v4i64_v4i64__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2399,18 +2297,19 @@ define void @v_shuffle_v4i64_v4i64__7_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2418,18 +2317,19 @@ define void @v_shuffle_v4i64_v4i64__7_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2437,18 +2337,19 @@ define void @v_shuffle_v4i64_v4i64__7_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2463,21 +2364,19 @@ define void @v_shuffle_v4i64_v4i64__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2485,21 +2384,19 @@ define void @v_shuffle_v4i64_v4i64__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2507,21 +2404,19 @@ define void @v_shuffle_v4i64_v4i64__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2536,19 +2431,19 @@ define void @v_shuffle_v4i64_v4i64__7_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2556,19 +2451,19 @@ define void @v_shuffle_v4i64_v4i64__7_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2576,20 +2471,19 @@ define void @v_shuffle_v4i64_v4i64__7_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2604,18 +2498,19 @@ define void @v_shuffle_v4i64_v4i64__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2623,18 +2518,19 @@ define void @v_shuffle_v4i64_v4i64__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2642,19 +2538,19 @@ define void @v_shuffle_v4i64_v4i64__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2669,18 +2565,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2688,18 +2585,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2707,19 +2605,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2733,19 +2631,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2753,18 +2650,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2772,19 +2667,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2799,18 +2691,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, v8 +; GFX900-NEXT: v_mov_b32_e32 v13, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2818,18 +2710,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, v8 +; GFX90A-NEXT: v_mov_b32_e32 v13, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2837,19 +2729,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v8 +; GFX942-NEXT: v_mov_b32_e32 v13, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2864,18 +2755,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, v8 +; GFX900-NEXT: v_mov_b32_e32 v15, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2883,18 +2774,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, v8 +; GFX90A-NEXT: v_mov_b32_e32 v15, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2902,19 +2793,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v14, v8 +; GFX942-NEXT: v_mov_b32_e32 v15, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2928,21 +2818,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2950,20 +2839,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2971,20 +2858,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2999,18 +2884,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3018,18 +2903,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3037,19 +2922,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v8 +; GFX942-NEXT: v_mov_b32_e32 v3, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3064,19 +2948,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3084,19 +2967,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3104,20 +2986,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3132,20 +3012,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3153,20 +3031,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3174,26 +3050,24 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=v"() - %vec1 = call <4 x i64> asm "; def $0", "=v"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v9 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 ret void } @@ -3403,13 +3277,13 @@ define void @v_shuffle_v4i64_v4i64__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3419,13 +3293,13 @@ define void @v_shuffle_v4i64_v4i64__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3435,13 +3309,13 @@ define void @v_shuffle_v4i64_v4i64__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3633,18 +3507,16 @@ define void @v_shuffle_v4i64_v4i64__7_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3652,18 +3524,16 @@ define void @v_shuffle_v4i64_v4i64__7_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3671,19 +3541,16 @@ define void @v_shuffle_v4i64_v4i64__7_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3698,18 +3565,16 @@ define void @v_shuffle_v4i64_v4i64__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3717,18 +3582,16 @@ define void @v_shuffle_v4i64_v4i64__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3736,19 +3599,16 @@ define void @v_shuffle_v4i64_v4i64__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3763,20 +3623,16 @@ define void @v_shuffle_v4i64_v4i64__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3784,21 +3640,16 @@ define void @v_shuffle_v4i64_v4i64__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3806,22 +3657,16 @@ define void @v_shuffle_v4i64_v4i64__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3836,19 +3681,19 @@ define void @v_shuffle_v4i64_v4i64__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3856,19 +3701,19 @@ define void @v_shuffle_v4i64_v4i64__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3876,19 +3721,19 @@ define void @v_shuffle_v4i64_v4i64__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3903,19 +3748,19 @@ define void @v_shuffle_v4i64_v4i64__7_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3923,19 +3768,19 @@ define void @v_shuffle_v4i64_v4i64__7_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3943,19 +3788,19 @@ define void @v_shuffle_v4i64_v4i64__7_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -3970,20 +3815,19 @@ define void @v_shuffle_v4i64_v4i64__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3991,20 +3835,19 @@ define void @v_shuffle_v4i64_v4i64__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v12 -; GFX90A-NEXT: v_mov_b32_e32 v1, v13 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4012,20 +3855,19 @@ define void @v_shuffle_v4i64_v4i64__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v12 -; GFX942-NEXT: v_mov_b32_e32 v1, v13 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4040,19 +3882,19 @@ define void @v_shuffle_v4i64_v4i64__7_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v10 -; GFX900-NEXT: v_mov_b32_e32 v5, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,19 +3902,19 @@ define void @v_shuffle_v4i64_v4i64__7_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v10 -; GFX90A-NEXT: v_mov_b32_e32 v5, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4080,20 +3922,19 @@ define void @v_shuffle_v4i64_v4i64__7_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v10 -; GFX942-NEXT: v_mov_b32_e32 v5, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4108,18 +3949,19 @@ define void @v_shuffle_v4i64_v4i64__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v10 -; GFX900-NEXT: v_mov_b32_e32 v7, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4127,18 +3969,19 @@ define void @v_shuffle_v4i64_v4i64__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v10 -; GFX90A-NEXT: v_mov_b32_e32 v7, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4146,19 +3989,19 @@ define void @v_shuffle_v4i64_v4i64__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v10 -; GFX942-NEXT: v_mov_b32_e32 v7, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4173,18 +4016,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4192,18 +4036,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4211,19 +4056,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4237,51 +4082,55 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4295,51 +4144,54 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4354,18 +4206,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v8 +; GFX900-NEXT: v_mov_b32_e32 v13, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4373,18 +4225,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v8 +; GFX90A-NEXT: v_mov_b32_e32 v13, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4392,19 +4244,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v8 +; GFX942-NEXT: v_mov_b32_e32 v13, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4418,19 +4269,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4438,18 +4290,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4457,18 +4309,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4483,18 +4335,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4502,18 +4354,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4521,19 +4373,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v8 +; GFX942-NEXT: v_mov_b32_e32 v3, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4548,19 +4400,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v2 -; GFX900-NEXT: v_mov_b32_e32 v9, v3 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4568,19 +4419,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4588,20 +4438,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4616,18 +4465,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4635,18 +4485,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4654,19 +4505,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -4881,14 +4733,13 @@ define void @v_shuffle_v4i64_v4i64__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4898,14 +4749,13 @@ define void @v_shuffle_v4i64_v4i64__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4915,14 +4765,13 @@ define void @v_shuffle_v4i64_v4i64__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5114,18 +4963,16 @@ define void @v_shuffle_v4i64_v4i64__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5133,18 +4980,16 @@ define void @v_shuffle_v4i64_v4i64__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5152,19 +4997,16 @@ define void @v_shuffle_v4i64_v4i64__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5179,18 +5021,16 @@ define void @v_shuffle_v4i64_v4i64__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5198,18 +5038,16 @@ define void @v_shuffle_v4i64_v4i64__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v12 -; GFX90A-NEXT: v_mov_b32_e32 v1, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5217,19 +5055,16 @@ define void @v_shuffle_v4i64_v4i64__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v12 -; GFX942-NEXT: v_mov_b32_e32 v1, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5244,20 +5079,16 @@ define void @v_shuffle_v4i64_v4i64__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v12 +; GFX900-NEXT: v_mov_b32_e32 v15, v13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5265,20 +5096,16 @@ define void @v_shuffle_v4i64_v4i64__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v12 +; GFX90A-NEXT: v_mov_b32_e32 v15, v13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5286,21 +5113,16 @@ define void @v_shuffle_v4i64_v4i64__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v12 +; GFX942-NEXT: v_mov_b32_e32 v15, v13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5315,18 +5137,16 @@ define void @v_shuffle_v4i64_v4i64__7_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5334,18 +5154,16 @@ define void @v_shuffle_v4i64_v4i64__7_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v12 -; GFX90A-NEXT: v_mov_b32_e32 v1, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5353,19 +5171,16 @@ define void @v_shuffle_v4i64_v4i64__7_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v12 -; GFX942-NEXT: v_mov_b32_e32 v1, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5380,19 +5195,19 @@ define void @v_shuffle_v4i64_v4i64__7_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5400,19 +5215,19 @@ define void @v_shuffle_v4i64_v4i64__7_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5420,19 +5235,19 @@ define void @v_shuffle_v4i64_v4i64__7_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5447,20 +5262,19 @@ define void @v_shuffle_v4i64_v4i64__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5468,20 +5282,19 @@ define void @v_shuffle_v4i64_v4i64__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5489,20 +5302,19 @@ define void @v_shuffle_v4i64_v4i64__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5517,19 +5329,19 @@ define void @v_shuffle_v4i64_v4i64__7_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v12 -; GFX900-NEXT: v_mov_b32_e32 v7, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5537,19 +5349,19 @@ define void @v_shuffle_v4i64_v4i64__7_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v12 -; GFX90A-NEXT: v_mov_b32_e32 v7, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5557,20 +5369,19 @@ define void @v_shuffle_v4i64_v4i64__7_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v6, v12 -; GFX942-NEXT: v_mov_b32_e32 v7, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5585,18 +5396,19 @@ define void @v_shuffle_v4i64_v4i64__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v12 -; GFX900-NEXT: v_mov_b32_e32 v9, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5604,18 +5416,19 @@ define void @v_shuffle_v4i64_v4i64__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v12 -; GFX90A-NEXT: v_mov_b32_e32 v9, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5623,19 +5436,19 @@ define void @v_shuffle_v4i64_v4i64__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v12 -; GFX942-NEXT: v_mov_b32_e32 v9, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5650,18 +5463,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5669,18 +5483,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5688,19 +5503,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5714,51 +5529,55 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5773,18 +5592,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v10, v12 ; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5792,18 +5612,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v12 ; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5811,18 +5632,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v12 ; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5836,51 +5658,54 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5894,19 +5719,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5914,18 +5740,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5933,18 +5759,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -5959,18 +5785,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5978,18 +5804,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5997,19 +5823,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v8 +; GFX942-NEXT: v_mov_b32_e32 v3, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6024,19 +5850,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v4 -; GFX900-NEXT: v_mov_b32_e32 v11, v5 -; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6044,19 +5869,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6064,20 +5888,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6092,18 +5915,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6111,18 +5935,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6130,19 +5955,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6156,42 +5982,43 @@ define void @v_shuffle_v4i64_v4i64__u_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__u_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__u_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__u_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6207,13 +6034,13 @@ define void @v_shuffle_v4i64_v4i64__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v2, v6 ; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,13 +6050,13 @@ define void @v_shuffle_v4i64_v4i64__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v2, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6239,13 +6066,13 @@ define void @v_shuffle_v4i64_v4i64__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v2, v6 ; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6261,11 +6088,13 @@ define void @v_shuffle_v4i64_v4i64__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6275,11 +6104,13 @@ define void @v_shuffle_v4i64_v4i64__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6289,11 +6120,13 @@ define void @v_shuffle_v4i64_v4i64__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6306,48 +6139,43 @@ define void @v_shuffle_v4i64_v4i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6360,42 +6188,43 @@ define void @v_shuffle_v4i64_v4i64__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__3_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__3_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__3_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6408,42 +6237,43 @@ define void @v_shuffle_v4i64_v4i64__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__4_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__4_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__4_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6457,17 +6287,17 @@ define void @v_shuffle_v4i64_v4i64__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v12, v6 ; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -6476,17 +6306,17 @@ define void @v_shuffle_v4i64_v4i64__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v12, v6 ; GFX90A-NEXT: v_mov_b32_e32 v13, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6495,17 +6325,18 @@ define void @v_shuffle_v4i64_v4i64__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v12, v6 ; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -6521,17 +6352,17 @@ define void @v_shuffle_v4i64_v4i64__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v14, v6 ; GFX900-NEXT: v_mov_b32_e32 v15, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -6540,17 +6371,17 @@ define void @v_shuffle_v4i64_v4i64__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v14, v6 ; GFX90A-NEXT: v_mov_b32_e32 v15, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6559,17 +6390,18 @@ define void @v_shuffle_v4i64_v4i64__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v14, v6 ; GFX942-NEXT: v_mov_b32_e32 v15, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -6585,19 +6417,16 @@ define void @v_shuffle_v4i64_v4i64__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6605,19 +6434,16 @@ define void @v_shuffle_v4i64_v4i64__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6625,19 +6451,16 @@ define void @v_shuffle_v4i64_v4i64__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6651,19 +6474,18 @@ define void @v_shuffle_v4i64_v4i64__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6671,18 +6493,16 @@ define void @v_shuffle_v4i64_v4i64__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6690,18 +6510,17 @@ define void @v_shuffle_v4i64_v4i64__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6718,60 +6537,49 @@ define void @v_shuffle_v4i64_v4i64__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_0_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v14 -; GFX90A-NEXT: v_mov_b32_e32 v3, v15 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_0_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v14 -; GFX942-NEXT: v_mov_b32_e32 v3, v15 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6785,57 +6593,52 @@ define void @v_shuffle_v4i64_v4i64__7_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v12 +; GFX900-NEXT: v_mov_b32_e32 v15, v13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_1_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v12 +; GFX90A-NEXT: v_mov_b32_e32 v15, v13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_1_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v12 +; GFX942-NEXT: v_mov_b32_e32 v15, v13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6849,66 +6652,52 @@ define void @v_shuffle_v4i64_v4i64__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v14 -; GFX90A-NEXT: v_mov_b32_e32 v3, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v14 -; GFX942-NEXT: v_mov_b32_e32 v3, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6922,63 +6711,59 @@ define void @v_shuffle_v4i64_v4i64__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_4_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[10:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v18, v10 +; GFX90A-NEXT: v_mov_b32_e32 v19, v11 +; GFX90A-NEXT: global_store_dwordx4 v20, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v20, v[16:19], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v20, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[10:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v18, v10 +; GFX942-NEXT: v_mov_b32_e32 v19, v11 +; GFX942-NEXT: global_store_dwordx4 v20, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v20, v[16:19], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -6992,19 +6777,20 @@ define void @v_shuffle_v4i64_v4i64__7_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v8, v14 -; GFX900-NEXT: v_mov_b32_e32 v9, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7012,18 +6798,18 @@ define void @v_shuffle_v4i64_v4i64__7_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v14 -; GFX90A-NEXT: v_mov_b32_e32 v9, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v16, v10 +; GFX90A-NEXT: v_mov_b32_e32 v17, v11 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7031,18 +6817,18 @@ define void @v_shuffle_v4i64_v4i64__7_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v8, v14 -; GFX942-NEXT: v_mov_b32_e32 v9, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, v10 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v17, v11 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7056,19 +6842,20 @@ define void @v_shuffle_v4i64_v4i64__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_6_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v14 -; GFX900-NEXT: v_mov_b32_e32 v11, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7076,18 +6863,18 @@ define void @v_shuffle_v4i64_v4i64__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v10, v14 -; GFX90A-NEXT: v_mov_b32_e32 v11, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v16, v12 +; GFX90A-NEXT: v_mov_b32_e32 v17, v13 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7095,18 +6882,18 @@ define void @v_shuffle_v4i64_v4i64__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v10, v14 -; GFX942-NEXT: v_mov_b32_e32 v11, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, v12 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v17, v13 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7120,19 +6907,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7140,18 +6928,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7159,18 +6947,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7184,51 +6972,55 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7243,18 +7035,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v14 +; GFX900-NEXT: v_mov_b32_e32 v11, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7262,18 +7055,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v14 +; GFX90A-NEXT: v_mov_b32_e32 v11, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7281,18 +7075,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v14 +; GFX942-NEXT: v_mov_b32_e32 v11, v15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7307,18 +7102,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7326,18 +7122,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7345,18 +7142,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v12 +; GFX942-NEXT: v_mov_b32_e32 v11, v13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7370,51 +7168,54 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7429,18 +7230,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v6 -; GFX900-NEXT: v_mov_b32_e32 v11, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7448,18 +7249,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v6 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7467,18 +7268,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v10, v6 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v8 +; GFX942-NEXT: v_mov_b32_e32 v3, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7493,19 +7295,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v6 -; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7513,19 +7314,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, v6 -; GFX90A-NEXT: v_mov_b32_e32 v13, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7533,20 +7333,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v6 -; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7561,18 +7360,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v12 -; GFX900-NEXT: v_mov_b32_e32 v5, v13 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7580,18 +7380,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v12 -; GFX90A-NEXT: v_mov_b32_e32 v5, v13 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7599,18 +7400,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v12 -; GFX942-NEXT: v_mov_b32_e32 v5, v13 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7755,39 +7558,33 @@ define void @v_shuffle_v4i64_v4i64__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__3_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__3_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__3_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7812,15 +7609,15 @@ define void @v_shuffle_v4i64_v4i64__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7828,15 +7625,15 @@ define void @v_shuffle_v4i64_v4i64__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7844,15 +7641,15 @@ define void @v_shuffle_v4i64_v4i64__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7924,15 +7721,13 @@ define void @v_shuffle_v4i64_v4i64__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7942,15 +7737,13 @@ define void @v_shuffle_v4i64_v4i64__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7960,15 +7753,13 @@ define void @v_shuffle_v4i64_v4i64__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -7989,10 +7780,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8006,10 +7794,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8023,10 +7808,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8043,18 +7825,14 @@ define void @v_shuffle_v4i64_v4i64__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8064,19 +7842,14 @@ define void @v_shuffle_v4i64_v4i64__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8086,20 +7859,14 @@ define void @v_shuffle_v4i64_v4i64__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8114,18 +7881,16 @@ define void @v_shuffle_v4i64_v4i64__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8133,18 +7898,16 @@ define void @v_shuffle_v4i64_v4i64__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8152,19 +7915,17 @@ define void @v_shuffle_v4i64_v4i64__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8179,18 +7940,16 @@ define void @v_shuffle_v4i64_v4i64__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8198,18 +7957,16 @@ define void @v_shuffle_v4i64_v4i64__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8217,19 +7974,17 @@ define void @v_shuffle_v4i64_v4i64__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8244,18 +7999,16 @@ define void @v_shuffle_v4i64_v4i64__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v8 -; GFX900-NEXT: v_mov_b32_e32 v11, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8263,18 +8016,16 @@ define void @v_shuffle_v4i64_v4i64__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v8 -; GFX90A-NEXT: v_mov_b32_e32 v11, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8282,18 +8033,17 @@ define void @v_shuffle_v4i64_v4i64__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v8 -; GFX942-NEXT: v_mov_b32_e32 v11, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8307,54 +8057,49 @@ define void @v_shuffle_v4i64_v4i64__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v12, v0 -; GFX90A-NEXT: v_mov_b32_e32 v13, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v12, v0 -; GFX942-NEXT: v_mov_b32_e32 v13, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8371,14 +8116,13 @@ define void @v_shuffle_v4i64_v4i64__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8388,14 +8132,13 @@ define void @v_shuffle_v4i64_v4i64__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8405,14 +8148,13 @@ define void @v_shuffle_v4i64_v4i64__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8429,13 +8171,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8445,13 +8187,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8461,13 +8203,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8481,48 +8223,42 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8539,14 +8275,14 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8556,14 +8292,14 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8573,14 +8309,14 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8597,14 +8333,14 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8614,14 +8350,14 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8631,14 +8367,14 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8655,14 +8391,14 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, v12 +; GFX900-NEXT: v_mov_b32_e32 v15, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8672,14 +8408,14 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, v12 +; GFX90A-NEXT: v_mov_b32_e32 v15, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8689,14 +8425,14 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v14, v12 +; GFX942-NEXT: v_mov_b32_e32 v15, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8713,60 +8449,48 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8783,14 +8507,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8800,14 +8523,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8817,14 +8539,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -8838,57 +8559,49 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_6_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -9150,15 +8863,15 @@ define void @v_shuffle_v4i64_v4i64__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v10 -; GFX900-NEXT: v_mov_b32_e32 v13, v11 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9169,15 +8882,15 @@ define void @v_shuffle_v4i64_v4i64__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, v10 -; GFX90A-NEXT: v_mov_b32_e32 v13, v11 -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9188,15 +8901,15 @@ define void @v_shuffle_v4i64_v4i64__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, v10 -; GFX942-NEXT: v_mov_b32_e32 v13, v11 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -9366,13 +9079,13 @@ define void @v_shuffle_v4i64_v4i64__7_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9382,13 +9095,13 @@ define void @v_shuffle_v4i64_v4i64__7_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9398,13 +9111,13 @@ define void @v_shuffle_v4i64_v4i64__7_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -9424,10 +9137,8 @@ define void @v_shuffle_v4i64_v4i64__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9440,10 +9151,8 @@ define void @v_shuffle_v4i64_v4i64__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9456,10 +9165,8 @@ define void @v_shuffle_v4i64_v4i64__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -9476,18 +9183,14 @@ define void @v_shuffle_v4i64_v4i64__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9497,18 +9200,14 @@ define void @v_shuffle_v4i64_v4i64__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9518,19 +9217,14 @@ define void @v_shuffle_v4i64_v4i64__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -9545,18 +9239,16 @@ define void @v_shuffle_v4i64_v4i64__7_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9564,18 +9256,16 @@ define void @v_shuffle_v4i64_v4i64__7_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9583,19 +9273,17 @@ define void @v_shuffle_v4i64_v4i64__7_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -9610,18 +9298,16 @@ define void @v_shuffle_v4i64_v4i64__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v8 -; GFX900-NEXT: v_mov_b32_e32 v11, v9 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9629,18 +9315,16 @@ define void @v_shuffle_v4i64_v4i64__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v8 -; GFX90A-NEXT: v_mov_b32_e32 v11, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9648,19 +9332,17 @@ define void @v_shuffle_v4i64_v4i64__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v8 -; GFX942-NEXT: v_mov_b32_e32 v11, v9 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -9675,18 +9357,16 @@ define void @v_shuffle_v4i64_v4i64__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v10 -; GFX900-NEXT: v_mov_b32_e32 v13, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9694,18 +9374,16 @@ define void @v_shuffle_v4i64_v4i64__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v10 -; GFX90A-NEXT: v_mov_b32_e32 v13, v11 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9713,20 +9391,19 @@ define void @v_shuffle_v4i64_v4i64__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v10 -; GFX942-NEXT: v_mov_b32_e32 v13, v11 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() %vec1 = call <4 x i64> asm "; def $0", "=v"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> @@ -9741,16 +9418,13 @@ define void @v_shuffle_v4i64_v4i64__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9760,16 +9434,13 @@ define void @v_shuffle_v4i64_v4i64__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9779,16 +9450,13 @@ define void @v_shuffle_v4i64_v4i64__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -9805,14 +9473,13 @@ define void @v_shuffle_v4i64_v4i64__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9822,14 +9489,13 @@ define void @v_shuffle_v4i64_v4i64__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9839,14 +9505,13 @@ define void @v_shuffle_v4i64_v4i64__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -9863,14 +9528,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9880,14 +9544,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9897,14 +9560,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -9918,42 +9580,42 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -9968,18 +9630,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9987,18 +9650,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10006,19 +9670,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -10033,18 +9698,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10052,18 +9718,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10071,19 +9738,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -10098,18 +9766,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10117,18 +9786,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10136,19 +9806,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -10163,18 +9834,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10184,16 +9855,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10203,16 +9874,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -10226,42 +9897,42 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -10278,13 +9949,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10294,13 +9965,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10310,13 +9981,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -10578,15 +10249,15 @@ define void @v_shuffle_v4i64_v4i64__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v12 +; GFX900-NEXT: v_mov_b32_e32 v15, v13 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v12 -; GFX900-NEXT: v_mov_b32_e32 v15, v13 -; GFX900-NEXT: v_mov_b32_e32 v10, v6 -; GFX900-NEXT: v_mov_b32_e32 v11, v7 ; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10597,15 +10268,15 @@ define void @v_shuffle_v4i64_v4i64__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v12 +; GFX90A-NEXT: v_mov_b32_e32 v15, v13 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, v12 -; GFX90A-NEXT: v_mov_b32_e32 v15, v13 -; GFX90A-NEXT: v_mov_b32_e32 v10, v6 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 ; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10616,15 +10287,15 @@ define void @v_shuffle_v4i64_v4i64__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v12 +; GFX942-NEXT: v_mov_b32_e32 v15, v13 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, v12 -; GFX942-NEXT: v_mov_b32_e32 v15, v13 -; GFX942-NEXT: v_mov_b32_e32 v10, v6 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 ; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -10794,14 +10465,13 @@ define void @v_shuffle_v4i64_v4i64__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10811,14 +10481,13 @@ define void @v_shuffle_v4i64_v4i64__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10828,14 +10497,13 @@ define void @v_shuffle_v4i64_v4i64__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -10855,10 +10523,8 @@ define void @v_shuffle_v4i64_v4i64__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10871,10 +10537,8 @@ define void @v_shuffle_v4i64_v4i64__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10887,10 +10551,8 @@ define void @v_shuffle_v4i64_v4i64__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -10907,18 +10569,14 @@ define void @v_shuffle_v4i64_v4i64__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10928,18 +10586,14 @@ define void @v_shuffle_v4i64_v4i64__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10949,19 +10603,14 @@ define void @v_shuffle_v4i64_v4i64__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -10976,18 +10625,16 @@ define void @v_shuffle_v4i64_v4i64__7_1_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10995,18 +10642,16 @@ define void @v_shuffle_v4i64_v4i64__7_1_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11014,19 +10659,17 @@ define void @v_shuffle_v4i64_v4i64__7_1_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11041,18 +10684,16 @@ define void @v_shuffle_v4i64_v4i64__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11060,18 +10701,16 @@ define void @v_shuffle_v4i64_v4i64__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11079,19 +10718,17 @@ define void @v_shuffle_v4i64_v4i64__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11106,18 +10743,16 @@ define void @v_shuffle_v4i64_v4i64__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11125,18 +10760,16 @@ define void @v_shuffle_v4i64_v4i64__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11144,18 +10777,17 @@ define void @v_shuffle_v4i64_v4i64__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11172,16 +10804,13 @@ define void @v_shuffle_v4i64_v4i64__7_4_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11191,16 +10820,13 @@ define void @v_shuffle_v4i64_v4i64__7_4_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11210,16 +10836,13 @@ define void @v_shuffle_v4i64_v4i64__7_4_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11233,54 +10856,56 @@ define void @v_shuffle_v4i64_v4i64__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v4 ; GFX900-NEXT: v_mov_b32_e32 v9, v5 ; GFX900-NEXT: v_mov_b32_e32 v10, v4 ; GFX900-NEXT: v_mov_b32_e32 v11, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_6_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 ; GFX90A-NEXT: v_mov_b32_e32 v12, v4 ; GFX90A-NEXT: v_mov_b32_e32 v13, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, v4 +; GFX90A-NEXT: v_mov_b32_e32 v15, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_5_6_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v12, v4 ; GFX942-NEXT: v_mov_b32_e32 v13, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v14, v4 +; GFX942-NEXT: v_mov_b32_e32 v15, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[12:15], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11297,14 +10922,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11314,14 +10938,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11331,14 +10954,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11352,45 +10974,42 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_u_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11405,18 +11024,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11424,18 +11044,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11443,19 +11064,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11470,18 +11092,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11489,18 +11112,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11508,19 +11132,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11535,18 +11160,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v10 -; GFX900-NEXT: v_mov_b32_e32 v7, v11 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11554,18 +11180,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v10 -; GFX90A-NEXT: v_mov_b32_e32 v7, v11 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11573,19 +11200,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v10 -; GFX942-NEXT: v_mov_b32_e32 v7, v11 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11600,19 +11228,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v6 -; GFX900-NEXT: v_mov_b32_e32 v11, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11622,17 +11249,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v6 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11642,18 +11268,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v6 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11670,13 +11294,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11686,13 +11310,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11702,13 +11326,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11722,45 +11346,42 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_5_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_5_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_5_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11774,42 +11395,43 @@ define void @v_shuffle_v4i64_v4i64__u_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__u_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__u_7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__u_7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11824,18 +11446,18 @@ define void @v_shuffle_v4i64_v4i64__0_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[10:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v18, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11843,18 +11465,18 @@ define void @v_shuffle_v4i64_v4i64__0_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[10:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11862,19 +11484,19 @@ define void @v_shuffle_v4i64_v4i64__0_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[10:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v6 +; GFX942-NEXT: v_mov_b32_e32 v13, v7 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11889,18 +11511,18 @@ define void @v_shuffle_v4i64_v4i64__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, v10 -; GFX900-NEXT: v_mov_b32_e32 v5, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11908,18 +11530,18 @@ define void @v_shuffle_v4i64_v4i64__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: v_mov_b32_e32 v4, v10 -; GFX90A-NEXT: v_mov_b32_e32 v5, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11927,19 +11549,19 @@ define void @v_shuffle_v4i64_v4i64__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: v_mov_b32_e32 v4, v10 -; GFX942-NEXT: v_mov_b32_e32 v5, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v6 +; GFX942-NEXT: v_mov_b32_e32 v13, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -11954,18 +11576,18 @@ define void @v_shuffle_v4i64_v4i64__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: v_mov_b32_e32 v6, v12 -; GFX900-NEXT: v_mov_b32_e32 v7, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11973,18 +11595,18 @@ define void @v_shuffle_v4i64_v4i64__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: v_mov_b32_e32 v6, v12 -; GFX90A-NEXT: v_mov_b32_e32 v7, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11992,19 +11614,19 @@ define void @v_shuffle_v4i64_v4i64__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: v_mov_b32_e32 v6, v12 -; GFX942-NEXT: v_mov_b32_e32 v7, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v6 +; GFX942-NEXT: v_mov_b32_e32 v13, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12018,20 +11640,18 @@ define void @v_shuffle_v4i64_v4i64__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__3_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v6 -; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12041,17 +11661,16 @@ define void @v_shuffle_v4i64_v4i64__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v6 -; GFX90A-NEXT: v_mov_b32_e32 v13, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12061,17 +11680,16 @@ define void @v_shuffle_v4i64_v4i64__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v12, v6 -; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12088,13 +11706,13 @@ define void @v_shuffle_v4i64_v4i64__4_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v2, v6 ; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12104,13 +11722,13 @@ define void @v_shuffle_v4i64_v4i64__4_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v2, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12120,13 +11738,13 @@ define void @v_shuffle_v4i64_v4i64__4_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v2, v6 ; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12143,11 +11761,13 @@ define void @v_shuffle_v4i64_v4i64__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12157,11 +11777,13 @@ define void @v_shuffle_v4i64_v4i64__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12171,11 +11793,13 @@ define void @v_shuffle_v4i64_v4i64__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12189,48 +11813,43 @@ define void @v_shuffle_v4i64_v4i64__6_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__6_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__6_7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__6_7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12244,48 +11863,43 @@ define void @v_shuffle_v4i64_v4i64__7_u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_u_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_u_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_u_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12299,64 +11913,61 @@ define void @v_shuffle_v4i64_v4i64__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_0_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: v_mov_b32_e32 v8, v6 ; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[10:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_0_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[10:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_0_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[10:17] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12371,18 +11982,19 @@ define void @v_shuffle_v4i64_v4i64__7_1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v10 ; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12390,18 +12002,19 @@ define void @v_shuffle_v4i64_v4i64__7_1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v10 ; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12409,19 +12022,20 @@ define void @v_shuffle_v4i64_v4i64__7_1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v8, v10 ; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12436,18 +12050,19 @@ define void @v_shuffle_v4i64_v4i64__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12455,18 +12070,19 @@ define void @v_shuffle_v4i64_v4i64__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12474,19 +12090,20 @@ define void @v_shuffle_v4i64_v4i64__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12501,18 +12118,19 @@ define void @v_shuffle_v4i64_v4i64__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12520,18 +12138,19 @@ define void @v_shuffle_v4i64_v4i64__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12539,18 +12158,20 @@ define void @v_shuffle_v4i64_v4i64__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12564,54 +12185,52 @@ define void @v_shuffle_v4i64_v4i64__7_4_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_4_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_4_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_4_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12625,48 +12244,52 @@ define void @v_shuffle_v4i64_v4i64__7_5_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_5_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12680,48 +12303,52 @@ define void @v_shuffle_v4i64_v4i64__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_6_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_6_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_6_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12735,45 +12362,42 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_u_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12788,18 +12412,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12807,18 +12432,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12826,19 +12452,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12853,18 +12480,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v10 -; GFX900-NEXT: v_mov_b32_e32 v5, v11 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12872,18 +12500,19 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v10 -; GFX90A-NEXT: v_mov_b32_e32 v5, v11 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12891,19 +12520,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v10 -; GFX942-NEXT: v_mov_b32_e32 v5, v11 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12918,37 +12548,39 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v12 -; GFX900-NEXT: v_mov_b32_e32 v7, v13 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_2_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v12 -; GFX90A-NEXT: v_mov_b32_e32 v7, v13 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12956,19 +12588,20 @@ define void @v_shuffle_v4i64_v4i64__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v12 -; GFX942-NEXT: v_mov_b32_e32 v7, v13 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -12983,19 +12616,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v6 -; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13005,17 +12637,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, v6 -; GFX90A-NEXT: v_mov_b32_e32 v13, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -13025,18 +12656,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v6 -; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -13053,13 +12682,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v6 ; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13069,13 +12698,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -13085,13 +12714,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v6 ; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -13108,11 +12737,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13122,11 +12753,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -13136,11 +12769,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -13154,45 +12789,42 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_6_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_6_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v4i64__7_7_6_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -13328,10 +12960,9 @@ define void @s_shuffle_v4i64_v4i64__2_u_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -13474,10 +13105,9 @@ define void @s_shuffle_v4i64_v4i64__6_u_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -13750,15 +13380,14 @@ define void @s_shuffle_v4i64_v4i64__7_3_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -13895,242 +13524,435 @@ define void @s_shuffle_v4i64_v4i64__7_6_u_u() { } define void @s_shuffle_v4i64_v4i64__7_7_u_u() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_u: +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_0_u() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_0_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_u: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_u: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_0_u() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_u: +define void @s_shuffle_v4i64_v4i64__7_7_2_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_u: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_u: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s10 ; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_1_u() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_u: +define void @s_shuffle_v4i64_v4i64__7_7_3_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_u: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_u: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s10 ; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_2_u() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_u: +define void @s_shuffle_v4i64_v4i64__7_7_4_u() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_u() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_5_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_u() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_u() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_7_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_u: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_u: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_3_u() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_u: +define void @s_shuffle_v4i64_v4i64__7_7_7_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_7_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_2() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_7_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -14141,14 +13963,14 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_u() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_u: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -14159,3681 +13981,180 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_u: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_4_u() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_5_u() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_6_u() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_7_u() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_7_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_7_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_7_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_7_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s22 -; GFX900-NEXT: s_mov_b32 s11, s23 -; GFX900-NEXT: s_mov_b32 s12, s22 -; GFX900-NEXT: s_mov_b32 s13, s23 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s22 -; GFX90A-NEXT: s_mov_b32 s11, s23 -; GFX90A-NEXT: s_mov_b32 s12, s22 -; GFX90A-NEXT: s_mov_b32 s13, s23 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_7_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s22 -; GFX900-NEXT: s_mov_b32 s11, s23 -; GFX900-NEXT: s_mov_b32 s12, s22 -; GFX900-NEXT: s_mov_b32 s13, s23 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s22 -; GFX90A-NEXT: s_mov_b32 s11, s23 -; GFX90A-NEXT: s_mov_b32 s12, s22 -; GFX90A-NEXT: s_mov_b32 s13, s23 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_7_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_7_6() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_6: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_6: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_6: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_7_7() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__u_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__0_0_0_0() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s9 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s9 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s9 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> zeroinitializer - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__1_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__2_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__3_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__4_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__5_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__6_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_0_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_u_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_1_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_2_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_3_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_4_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_5_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_6_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_0_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_u_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_1_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_2_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s20 -; GFX900-NEXT: s_mov_b32 s13, s21 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s20 -; GFX90A-NEXT: s_mov_b32 s13, s21 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_3_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s22 -; GFX900-NEXT: s_mov_b32 s13, s23 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s22 -; GFX90A-NEXT: s_mov_b32 s13, s23 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_4_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_5_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_6_0() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__u_1_1_1() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__0_1_1_1() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__1_1_1_1() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__2_1_1_1() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__3_1_1_1() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__4_1_1_1() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__5_1_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__6_1_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_1_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_u_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_0_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_2_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_3_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_4_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_5_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_6_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_1_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_u_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_0_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_2_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_3_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v4i64__7_7_7_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_7_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_4_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v4i64__7_7_7_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_7_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_5_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1: +define void @s_shuffle_v4i64_v4i64__7_7_7_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s10, s18 ; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s10, s18 ; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 ; GFX942-NEXT: s_mov_b32 s12, s6 ; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_6_1() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v4i64__7_7_7_7() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__u_2_2_2() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2: +define void @s_shuffle_v4i64_v4i64__u_0_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 @@ -17844,41 +14165,43 @@ define void @s_shuffle_v4i64_v4i64__u_2_2_2() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__0_2_2_2() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2: +define void @s_shuffle_v4i64_v4i64__0_0_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> zeroinitializer call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__1_2_2_2() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2: +define void @s_shuffle_v4i64_v4i64__1_0_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 @@ -17888,43 +14211,43 @@ define void @s_shuffle_v4i64_v4i64__1_2_2_2() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__2_2_2_2() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2: +define void @s_shuffle_v4i64_v4i64__2_0_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__3_2_2_2() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2: +define void @s_shuffle_v4i64_v4i64__3_0_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 @@ -17934,17 +14257,17 @@ define void @s_shuffle_v4i64_v4i64__3_2_2_2() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__4_2_2_2() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2: +define void @s_shuffle_v4i64_v4i64__4_0_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 @@ -17955,17 +14278,17 @@ define void @s_shuffle_v4i64_v4i64__4_2_2_2() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__5_2_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2: +define void @s_shuffle_v4i64_v4i64__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] @@ -17981,11 +14304,11 @@ define void @s_shuffle_v4i64_v4i64__5_2_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] @@ -18001,11 +14324,11 @@ define void @s_shuffle_v4i64_v4i64__5_2_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -18022,21 +14345,48 @@ define void @s_shuffle_v4i64_v4i64__5_2_2_2() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__6_2_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2: +define void @s_shuffle_v4i64_v4i64__6_0_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__6_0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: s_mov_b32 s10, s12 ; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: s_mov_b32 s14, s12 @@ -18046,15 +14396,17 @@ define void @s_shuffle_v4i64_v4i64__6_2_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: s_mov_b32 s10, s12 ; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: s_mov_b32 s14, s12 @@ -18064,17 +14416,17 @@ define void @s_shuffle_v4i64_v4i64__6_2_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s14, s12 @@ -18085,25 +14437,23 @@ define void @s_shuffle_v4i64_v4i64__6_2_2_2() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_2_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2: +define void @s_shuffle_v4i64_v4i64__7_u_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: s_mov_b32 s14, s12 ; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART @@ -18111,19 +14461,17 @@ define void @s_shuffle_v4i64_v4i64__7_2_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: s_mov_b32 s14, s12 ; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART @@ -18131,19 +14479,17 @@ define void @s_shuffle_v4i64_v4i64__7_2_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART @@ -18152,23 +14498,186 @@ define void @s_shuffle_v4i64_v4i64__7_2_2_2() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_1_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_0_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_2_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_0_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_3_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_3_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_u_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2: +define void @s_shuffle_v4i64_v4i64__7_4_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s14, s12 ; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART @@ -18176,17 +14685,19 @@ define void @s_shuffle_v4i64_v4i64__7_u_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s14, s12 ; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART @@ -18194,17 +14705,19 @@ define void @s_shuffle_v4i64_v4i64__7_u_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART @@ -18213,57 +14726,53 @@ define void @s_shuffle_v4i64_v4i64__7_u_2_2() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_0_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2: +define void @s_shuffle_v4i64_v4i64__7_5_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -18274,35 +14783,35 @@ define void @s_shuffle_v4i64_v4i64__7_0_2_2() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_1_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2: +define void @s_shuffle_v4i64_v4i64__7_6_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s22 ; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 ; GFX900-NEXT: s_mov_b32 s14, s12 ; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART @@ -18310,17 +14819,19 @@ define void @s_shuffle_v4i64_v4i64__7_1_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s22 ; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 ; GFX90A-NEXT: s_mov_b32 s14, s12 ; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART @@ -18328,17 +14839,19 @@ define void @s_shuffle_v4i64_v4i64__7_1_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART @@ -18347,25 +14860,48 @@ define void @s_shuffle_v4i64_v4i64__7_1_2_2() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_3_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2: +define void @s_shuffle_v4i64_v4i64__7_7_0_0() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_u_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: s_mov_b32 s14, s12 ; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART @@ -18373,19 +14909,17 @@ define void @s_shuffle_v4i64_v4i64__7_3_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: s_mov_b32 s14, s12 ; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART @@ -18393,113 +14927,165 @@ define void @s_shuffle_v4i64_v4i64__7_3_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_4_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2: +define void @s_shuffle_v4i64_v4i64__7_7_1_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_2_0() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[16:23] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s20 +; GFX9-NEXT: s_mov_b32 s13, s21 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_0() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[16:23] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s22 +; GFX9-NEXT: s_mov_b32 s13, s23 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_5_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2: +define void @s_shuffle_v4i64_v4i64__7_7_4_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART @@ -18507,19 +15093,19 @@ define void @s_shuffle_v4i64_v4i64__7_5_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: s_mov_b32 s14, s16 ; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART @@ -18527,137 +15113,141 @@ define void @s_shuffle_v4i64_v4i64__7_5_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: s_mov_b32 s12, s4 ; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_6_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2: +define void @s_shuffle_v4i64_v4i64__7_7_5_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_2_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2: +define void @s_shuffle_v4i64_v4i64__7_7_6_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -18666,124 +15256,197 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__u_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__0_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__1_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__2_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_u_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v4i64__4_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_0_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2: +define void @s_shuffle_v4i64_v4i64__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -18792,165 +15455,117 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_1_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v4i64__6_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_3_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2: +define void @s_shuffle_v4i64_v4i64__7_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_4_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2: +define void @s_shuffle_v4i64_v4i64__7_u_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -18961,16 +15576,14 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -18981,16 +15594,14 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -18999,25 +15610,23 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s18 -; GFX942-NEXT: s_mov_b32 s11, s19 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_5_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2: +define void @s_shuffle_v4i64_v4i64__7_0_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -19028,16 +15637,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -19048,16 +15657,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19066,238 +15675,131 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s18 -; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_6_2() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2: +define void @s_shuffle_v4i64_v4i64__7_2_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__u_3_3_3() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__0_3_3_3() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__1_3_3_3() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__2_3_3_3() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__3_3_3_3() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3: +define void @s_shuffle_v4i64_v4i64__7_3_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__4_3_3_3() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__5_3_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3: +define void @s_shuffle_v4i64_v4i64__7_4_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART @@ -19305,19 +15807,19 @@ define void @s_shuffle_v4i64_v4i64__5_3_3_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s14 ; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART @@ -19325,19 +15827,19 @@ define void @s_shuffle_v4i64_v4i64__5_3_3_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART @@ -19346,49 +15848,53 @@ define void @s_shuffle_v4i64_v4i64__5_3_3_3() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__6_3_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3: +define void @s_shuffle_v4i64_v4i64__7_5_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19397,37 +15903,37 @@ define void @s_shuffle_v4i64_v4i64__6_3_3_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s14, s2 +; GFX942-NEXT: s_mov_b32 s15, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_3_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3: +define void @s_shuffle_v4i64_v4i64__7_6_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART @@ -19435,19 +15941,19 @@ define void @s_shuffle_v4i64_v4i64__7_3_3_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 ; GFX90A-NEXT: s_mov_b32 s12, s14 ; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART @@ -19455,19 +15961,19 @@ define void @s_shuffle_v4i64_v4i64__7_3_3_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART @@ -19476,49 +15982,224 @@ define void @s_shuffle_v4i64_v4i64__7_3_3_3() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_u_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_0_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_2_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s16 +; GFX9-NEXT: s_mov_b32 s13, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_3_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s18 +; GFX9-NEXT: s_mov_b32 s13, s19 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_4_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_1() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_u_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3: +define void @s_shuffle_v4i64_v4i64__7_7_6_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19527,67 +16208,197 @@ define void @s_shuffle_v4i64_v4i64__7_u_3_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s14, s2 +; GFX942-NEXT: s_mov_b32 s15, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_0_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3: +define void @s_shuffle_v4i64_v4i64__u_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__0_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__1_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__2_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: s_mov_b32 s10, s12 ; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: s_mov_b32 s10, s12 ; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19596,88 +16407,86 @@ define void @s_shuffle_v4i64_v4i64__7_0_3_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_1_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3: +define void @s_shuffle_v4i64_v4i64__6_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_2_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3: +define void @s_shuffle_v4i64_v4i64__7_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -19690,14 +16499,14 @@ define void @s_shuffle_v4i64_v4i64__7_2_3_3() { ; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: s_mov_b32 s10, s12 ; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -19710,14 +16519,14 @@ define void @s_shuffle_v4i64_v4i64__7_2_3_3() { ; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: s_mov_b32 s10, s12 ; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19730,21 +16539,21 @@ define void @s_shuffle_v4i64_v4i64__7_2_3_3() { ; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_4_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3: +define void @s_shuffle_v4i64_v4i64__7_u_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -19755,16 +16564,14 @@ define void @s_shuffle_v4i64_v4i64__7_4_3_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -19775,16 +16582,14 @@ define void @s_shuffle_v4i64_v4i64__7_4_3_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19795,63 +16600,65 @@ define void @s_shuffle_v4i64_v4i64__7_4_3_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_5_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3: +define void @s_shuffle_v4i64_v4i64__7_0_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19862,23 +16669,25 @@ define void @s_shuffle_v4i64_v4i64__7_5_3_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_6_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3: +define void @s_shuffle_v4i64_v4i64__7_1_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -19889,16 +16698,14 @@ define void @s_shuffle_v4i64_v4i64__7_6_3_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s22 ; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -19909,16 +16716,14 @@ define void @s_shuffle_v4i64_v4i64__7_6_3_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s22 ; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19929,23 +16734,21 @@ define void @s_shuffle_v4i64_v4i64__7_6_3_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_3_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3: +define void @s_shuffle_v4i64_v4i64__7_3_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -19956,14 +16759,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -19974,14 +16779,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19992,23 +16799,23 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_u_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3: +define void @s_shuffle_v4i64_v4i64__7_4_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -20019,12 +16826,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -20035,12 +16846,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -20051,126 +16866,63 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_0_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_1_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3: +define void @s_shuffle_v4i64_v4i64__7_5_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -20181,57 +16933,63 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_2_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3: +define void @s_shuffle_v4i64_v4i64__7_6_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -20242,21 +17000,23 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_4_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3: +define void @s_shuffle_v4i64_v4i64__7_7_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -20267,14 +17027,14 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -20285,41 +17045,39 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_5_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3: +define void @s_shuffle_v4i64_v4i64__7_7_u_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -20330,14 +17088,14 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -20348,41 +17106,172 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_6_3() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3: +define void @s_shuffle_v4i64_v4i64__7_7_0_2() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_2() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_2() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s18 +; GFX9-NEXT: s_mov_b32 s13, s19 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_4_2() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_2() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -20395,14 +17284,14 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_3() { ; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s14 ; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -20415,14 +17304,14 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_3() { ; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s14 ; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -20435,580 +17324,648 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_3() { ; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: s_mov_b32 s10, s14 ; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__u_4_4_4() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_4_4_4: +define void @s_shuffle_v4i64_v4i64__u_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__0_4_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v4i64__0_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__1_4_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v4i64__1_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__2_4_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4: +define void @s_shuffle_v4i64_v4i64__2_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__3_4_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4: +define void @s_shuffle_v4i64_v4i64__6_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__4_4_4_4() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_4_4_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__5_4_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4: +define void @s_shuffle_v4i64_v4i64__7_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__6_4_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4: +define void @s_shuffle_v4i64_v4i64__7_u_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_4_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4: +define void @s_shuffle_v4i64_v4i64__7_0_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_u_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4: +define void @s_shuffle_v4i64_v4i64__7_1_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_0_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4: +define void @s_shuffle_v4i64_v4i64__7_2_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_1_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4: +define void @s_shuffle_v4i64_v4i64__7_4_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -21019,779 +17976,738 @@ define void @s_shuffle_v4i64_v4i64__7_1_4_4() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_2_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4: +define void @s_shuffle_v4i64_v4i64__7_5_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:23] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s22 -; GFX942-NEXT: s_mov_b32 s9, s23 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s16 -; GFX942-NEXT: s_mov_b32 s13, s17 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_3_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4: +define void @s_shuffle_v4i64_v4i64__7_6_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s22 ; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s22 ; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:23] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s22 -; GFX942-NEXT: s_mov_b32 s9, s23 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s16 -; GFX942-NEXT: s_mov_b32 s13, s17 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_5_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4: +define void @s_shuffle_v4i64_v4i64__7_7_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_6_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4: +define void @s_shuffle_v4i64_v4i64__7_7_u_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_4_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4: +define void @s_shuffle_v4i64_v4i64__7_7_0_3() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s18 +; GFX9-NEXT: s_mov_b32 s15, s19 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_3() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s18 +; GFX9-NEXT: s_mov_b32 s15, s19 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_2_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_u_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4: +define void @s_shuffle_v4i64_v4i64__7_7_4_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_0_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4: +define void @s_shuffle_v4i64_v4i64__7_7_5_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s22 -; GFX900-NEXT: s_mov_b32 s11, s23 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s22 -; GFX90A-NEXT: s_mov_b32 s11, s23 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s10 ; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_1_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4: +define void @s_shuffle_v4i64_v4i64__7_7_6_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s22 -; GFX900-NEXT: s_mov_b32 s11, s23 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s22 -; GFX90A-NEXT: s_mov_b32 s11, s23 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__u_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_2_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4: +define void @s_shuffle_v4i64_v4i64__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_3_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4: +define void @s_shuffle_v4i64_v4i64__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:23] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s22 -; GFX942-NEXT: s_mov_b32 s9, s23 -; GFX942-NEXT: s_mov_b32 s10, s22 -; GFX942-NEXT: s_mov_b32 s11, s23 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_5_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4: +define void @s_shuffle_v4i64_v4i64__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_6_4() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4: +define void @s_shuffle_v4i64_v4i64__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s22 -; GFX900-NEXT: s_mov_b32 s11, s23 -; GFX900-NEXT: s_mov_b32 s12, s20 -; GFX900-NEXT: s_mov_b32 s13, s21 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s22 -; GFX90A-NEXT: s_mov_b32 s11, s23 -; GFX90A-NEXT: s_mov_b32 s12, s20 -; GFX90A-NEXT: s_mov_b32 s13, s21 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -21799,533 +18715,772 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_4() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__5_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__u_5_5_5() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5: +define void @s_shuffle_v4i64_v4i64__6_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__0_5_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v4i64__7_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__1_5_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5: +define void @s_shuffle_v4i64_v4i64__7_u_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_0_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__2_5_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5: +define void @s_shuffle_v4i64_v4i64__7_1_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__3_5_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5: +define void @s_shuffle_v4i64_v4i64__7_2_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__4_5_5_5() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5: +define void @s_shuffle_v4i64_v4i64__7_3_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__5_5_5_5() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5: +define void @s_shuffle_v4i64_v4i64__7_5_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_6_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_4_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__6_5_5_5() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5: +define void @s_shuffle_v4i64_v4i64__7_7_u_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_5_5_5() { -; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_5_5_5: +define void @s_shuffle_v4i64_v4i64__7_7_0_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_u_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5: +define void @s_shuffle_v4i64_v4i64__7_7_1_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_0_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5: +define void @s_shuffle_v4i64_v4i64__7_7_2_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: s_mov_b32 s12, s14 ; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s10 ; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_4() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; GFX900-NEXT: s_mov_b32 s12, s20 +; GFX900-NEXT: s_mov_b32 s13, s21 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: s_mov_b32 s12, s20 +; GFX90A-NEXT: s_mov_b32 s13, s21 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__u_5_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_1_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5: +define void @s_shuffle_v4i64_v4i64__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -22334,8 +19489,8 @@ define void @s_shuffle_v4i64_v4i64__7_1_5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART @@ -22343,7 +19498,7 @@ define void @s_shuffle_v4i64_v4i64__7_1_5_5() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -22352,8 +19507,8 @@ define void @s_shuffle_v4i64_v4i64__7_1_5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: s_mov_b32 s12, s14 ; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART @@ -22361,241 +19516,385 @@ define void @s_shuffle_v4i64_v4i64__7_1_5_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_2_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5: +define void @s_shuffle_v4i64_v4i64__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_3_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5: +define void @s_shuffle_v4i64_v4i64__2_5_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_5_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__5_5_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__6_5_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_5_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_u_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_4_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5: +define void @s_shuffle_v4i64_v4i64__7_0_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_6_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5: +define void @s_shuffle_v4i64_v4i64__7_1_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART @@ -22603,16 +19902,17 @@ define void @s_shuffle_v4i64_v4i64__7_6_5_5() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s12, s14 ; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART @@ -22620,138 +19920,210 @@ define void @s_shuffle_v4i64_v4i64__7_6_5_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_5_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5: +define void @s_shuffle_v4i64_v4i64__7_2_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5: +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5: +; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) ret void } -define void @s_shuffle_v4i64_v4i64__7_7_u_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4i64_v4i64__7_3_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_4_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_6_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_u_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> @@ -22760,65 +20132,23 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_5() { } define void @s_shuffle_v4i64_v4i64__7_7_0_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> @@ -22831,17 +20161,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -22851,17 +20181,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -22936,15 +20266,14 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -23001,17 +20330,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s18 -; GFX942-NEXT: s_mov_b32 s11, s19 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -23024,118 +20353,46 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_5() { } define void @s_shuffle_v4i64_v4i64__7_7_4_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x i64> asm "; def $0", "=s"() - %vec1 = call <4 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v4i64__7_7_6_5() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_5() { +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s18 +; GFX9-NEXT: s_mov_b32 s11, s19 +; GFX9-NEXT: s_mov_b32 s12, s16 +; GFX9-NEXT: s_mov_b32 s13, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> @@ -23304,17 +20561,17 @@ define void @s_shuffle_v4i64_v4i64__2_6_6_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -23324,17 +20581,17 @@ define void @s_shuffle_v4i64_v4i64__2_6_6_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -23344,17 +20601,18 @@ define void @s_shuffle_v4i64_v4i64__2_6_6_6() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -23484,14 +20742,14 @@ define void @s_shuffle_v4i64_v4i64__6_6_6_6() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -23755,17 +21013,17 @@ define void @s_shuffle_v4i64_v4i64__7_3_6_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -23775,17 +21033,17 @@ define void @s_shuffle_v4i64_v4i64__7_3_6_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -23795,17 +21053,18 @@ define void @s_shuffle_v4i64_v4i64__7_3_6_6() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -23990,19 +21249,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; GFX900-NEXT: s_mov_b32 s14, s20 +; GFX900-NEXT: s_mov_b32 s15, s21 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -24012,19 +21269,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: s_mov_b32 s14, s20 +; GFX90A-NEXT: s_mov_b32 s15, s21 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -24039,14 +21294,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_6() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s18 -; GFX942-NEXT: s_mov_b32 s11, s19 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -24272,58 +21525,22 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_6() { } define void @s_shuffle_v4i64_v4i64__7_7_4_6() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_6: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_6: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_6: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_4_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s18 +; GFX9-NEXT: s_mov_b32 s11, s19 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> @@ -24400,12 +21617,12 @@ define void @s_shuffle_v4i64_v4i64__u_7_7_7() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -24489,17 +21706,17 @@ define void @s_shuffle_v4i64_v4i64__1_7_7_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -24509,17 +21726,17 @@ define void @s_shuffle_v4i64_v4i64__1_7_7_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -24529,17 +21746,18 @@ define void @s_shuffle_v4i64_v4i64__1_7_7_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -24556,17 +21774,17 @@ define void @s_shuffle_v4i64_v4i64__2_7_7_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -24576,17 +21794,17 @@ define void @s_shuffle_v4i64_v4i64__2_7_7_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -24596,17 +21814,18 @@ define void @s_shuffle_v4i64_v4i64__2_7_7_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -24623,17 +21842,17 @@ define void @s_shuffle_v4i64_v4i64__3_7_7_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -24643,17 +21862,17 @@ define void @s_shuffle_v4i64_v4i64__3_7_7_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -24665,15 +21884,16 @@ define void @s_shuffle_v4i64_v4i64__3_7_7_7() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -24712,14 +21932,14 @@ define void @s_shuffle_v4i64_v4i64__5_7_7_7() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -24736,14 +21956,12 @@ define void @s_shuffle_v4i64_v4i64__6_7_7_7() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -24983,17 +22201,17 @@ define void @s_shuffle_v4i64_v4i64__7_3_7_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -25003,17 +22221,17 @@ define void @s_shuffle_v4i64_v4i64__7_3_7_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -25023,17 +22241,18 @@ define void @s_shuffle_v4i64_v4i64__7_3_7_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -25160,12 +22379,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_7() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -25178,65 +22397,23 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_7() { } define void @s_shuffle_v4i64_v4i64__7_7_0_7() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_0_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> @@ -25249,17 +22426,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -25269,17 +22446,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -25289,17 +22466,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 ; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -25319,14 +22496,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_7() { ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s22 -; GFX900-NEXT: s_mov_b32 s11, s23 -; GFX900-NEXT: s_mov_b32 s14, s22 -; GFX900-NEXT: s_mov_b32 s15, s23 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -25339,14 +22514,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_7() { ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s22 -; GFX90A-NEXT: s_mov_b32 s11, s23 -; GFX90A-NEXT: s_mov_b32 s14, s22 -; GFX90A-NEXT: s_mov_b32 s15, s23 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -25358,15 +22531,14 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_7() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -25383,17 +22555,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -25403,17 +22575,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -25425,15 +22597,15 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_7() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -25446,58 +22618,22 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_7() { } define void @s_shuffle_v4i64_v4i64__7_7_4_7() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_4_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> @@ -25506,62 +22642,22 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_7() { } define void @s_shuffle_v4i64_v4i64__7_7_5_7() { -; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_5_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=s"() %vec1 = call <4 x i64> asm "; def $0", "=s"() %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll index ee3b303f88471..8ffb3615940bd 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll @@ -58,39 +58,33 @@ define void @v_shuffle_v4p0_v2p0__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -114,39 +108,33 @@ define void @v_shuffle_v4p0_v2p0__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -160,55 +148,42 @@ define void @v_shuffle_v4p0_v2p0__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_0_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_0_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -222,49 +197,43 @@ define void @v_shuffle_v4p0_v2p0__3_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_1_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_1_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -291,31 +260,27 @@ define void @v_shuffle_v4p0_v2p0__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -329,39 +294,40 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -375,54 +341,54 @@ define void @v_shuffle_v4p0_v2p0__3_3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -436,57 +402,51 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -500,45 +460,42 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -552,44 +509,47 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() %vec1 = call <2 x ptr> asm "; def $0", "=v"() %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> @@ -601,63 +561,54 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -671,57 +622,55 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -735,54 +684,52 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -796,42 +743,43 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -846,13 +794,13 @@ define void @v_shuffle_v4p0_v2p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -860,13 +808,13 @@ define void @v_shuffle_v4p0_v2p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -874,13 +822,13 @@ define void @v_shuffle_v4p0_v2p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -943,53 +891,47 @@ define void @v_shuffle_v4p0_v2p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1003,13 +945,13 @@ define void @v_shuffle_v4p0_v2p0__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1017,13 +959,13 @@ define void @v_shuffle_v4p0_v2p0__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1031,13 +973,13 @@ define void @v_shuffle_v4p0_v2p0__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1051,21 +993,16 @@ define void @v_shuffle_v4p0_v2p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1073,21 +1010,16 @@ define void @v_shuffle_v4p0_v2p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1095,22 +1027,16 @@ define void @v_shuffle_v4p0_v2p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1125,19 +1051,16 @@ define void @v_shuffle_v4p0_v2p0__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,19 +1068,16 @@ define void @v_shuffle_v4p0_v2p0__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1165,20 +1085,16 @@ define void @v_shuffle_v4p0_v2p0__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1192,63 +1108,58 @@ define void @v_shuffle_v4p0_v2p0__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v12, v0 -; GFX90A-NEXT: v_mov_b32_e32 v13, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v12, v0 -; GFX942-NEXT: v_mov_b32_e32 v13, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1263,21 +1174,19 @@ define void @v_shuffle_v4p0_v2p0__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1285,21 +1194,19 @@ define void @v_shuffle_v4p0_v2p0__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1307,21 +1214,19 @@ define void @v_shuffle_v4p0_v2p0__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1336,19 +1241,19 @@ define void @v_shuffle_v4p0_v2p0__3_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1356,19 +1261,19 @@ define void @v_shuffle_v4p0_v2p0__3_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,20 +1281,19 @@ define void @v_shuffle_v4p0_v2p0__3_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1404,19 +1308,16 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1424,19 +1325,16 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1444,20 +1342,16 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1474,17 +1368,16 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1494,17 +1387,16 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1514,18 +1406,16 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1540,20 +1430,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1561,20 +1449,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1582,20 +1468,19 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1609,42 +1494,43 @@ define void @v_shuffle_v4p0_v2p0__u_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__u_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__u_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__u_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1657,49 +1543,43 @@ define void @v_shuffle_v4p0_v2p0__0_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1712,42 +1592,43 @@ define void @v_shuffle_v4p0_v2p0__1_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__1_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1760,42 +1641,43 @@ define void @v_shuffle_v4p0_v2p0__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__2_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__2_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__2_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1809,19 +1691,16 @@ define void @v_shuffle_v4p0_v2p0__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1829,19 +1708,16 @@ define void @v_shuffle_v4p0_v2p0__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1849,19 +1725,16 @@ define void @v_shuffle_v4p0_v2p0__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1876,19 +1749,16 @@ define void @v_shuffle_v4p0_v2p0__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1896,19 +1766,16 @@ define void @v_shuffle_v4p0_v2p0__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1916,19 +1783,17 @@ define void @v_shuffle_v4p0_v2p0__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1942,69 +1807,52 @@ define void @v_shuffle_v4p0_v2p0__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_0_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v2 -; GFX900-NEXT: v_mov_b32_e32 v9, v3 -; GFX900-NEXT: v_mov_b32_e32 v10, v2 -; GFX900-NEXT: v_mov_b32_e32 v11, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_0_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: v_mov_b32_e32 v12, v2 -; GFX90A-NEXT: v_mov_b32_e32 v13, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_0_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: v_mov_b32_e32 v12, v2 -; GFX942-NEXT: v_mov_b32_e32 v13, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2018,66 +1866,58 @@ define void @v_shuffle_v4p0_v2p0__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_2_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2092,18 +1932,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2111,18 +1951,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2130,18 +1970,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2155,51 +1995,55 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2213,51 +2057,54 @@ define void @v_shuffle_v4p0_v2p0__3_3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2272,18 +2119,19 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2291,18 +2139,19 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2310,18 +2159,20 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2385,39 +2236,33 @@ define void @v_shuffle_v4p0_v2p0__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__1_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2443,53 +2288,47 @@ define void @v_shuffle_v4p0_v2p0__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v6, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2503,54 +2342,43 @@ define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_u_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2564,70 +2392,52 @@ define void @v_shuffle_v4p0_v2p0__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_0_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: v_mov_b32_e32 v8, v2 -; GFX900-NEXT: v_mov_b32_e32 v9, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_0_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_0_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2642,20 +2452,16 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v10, v4 -; GFX900-NEXT: v_mov_b32_e32 v11, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2663,20 +2469,16 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: v_mov_b32_e32 v12, v4 -; GFX90A-NEXT: v_mov_b32_e32 v13, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2684,21 +2486,17 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: v_mov_b32_e32 v12, v4 -; GFX942-NEXT: v_mov_b32_e32 v13, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2712,54 +2510,49 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2773,48 +2566,42 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2831,15 +2618,14 @@ define void @v_shuffle_v4p0_v2p0__3_3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2849,15 +2635,14 @@ define void @v_shuffle_v4p0_v2p0__3_3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2867,15 +2652,14 @@ define void @v_shuffle_v4p0_v2p0__3_3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2889,63 +2673,51 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2959,42 +2731,43 @@ define void @v_shuffle_v4p0_v2p0__u_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__u_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__u_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__u_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3009,16 +2782,18 @@ define void @v_shuffle_v4p0_v2p0__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3026,16 +2801,18 @@ define void @v_shuffle_v4p0_v2p0__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3043,17 +2820,19 @@ define void @v_shuffle_v4p0_v2p0__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3070,17 +2849,16 @@ define void @v_shuffle_v4p0_v2p0__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3090,17 +2868,16 @@ define void @v_shuffle_v4p0_v2p0__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3110,17 +2887,16 @@ define void @v_shuffle_v4p0_v2p0__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3134,49 +2910,43 @@ define void @v_shuffle_v4p0_v2p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3190,42 +2960,43 @@ define void @v_shuffle_v4p0_v2p0__3_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3239,61 +3010,61 @@ define void @v_shuffle_v4p0_v2p0__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_0_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_0_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_0_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3308,18 +3079,19 @@ define void @v_shuffle_v4p0_v2p0__3_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3327,18 +3099,19 @@ define void @v_shuffle_v4p0_v2p0__3_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3346,18 +3119,20 @@ define void @v_shuffle_v4p0_v2p0__3_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3371,58 +3146,52 @@ define void @v_shuffle_v4p0_v2p0__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3436,45 +3205,42 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3489,16 +3255,19 @@ define void @v_shuffle_v4p0_v2p0__3_3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3506,16 +3275,19 @@ define void @v_shuffle_v4p0_v2p0__3_3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3523,17 +3295,20 @@ define void @v_shuffle_v4p0_v2p0__3_3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3550,17 +3325,16 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3570,17 +3344,16 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3590,18 +3363,16 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3615,45 +3386,42 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_3_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll index 09e497259766e..8ab9f381704e7 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll @@ -100,39 +100,33 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__2_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__2_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -196,39 +190,33 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -242,55 +230,42 @@ define void @v_shuffle_v4p0_v3p0__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_0_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_0_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -304,49 +279,43 @@ define void @v_shuffle_v4p0_v3p0__5_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_1_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_1_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -360,49 +329,43 @@ define void @v_shuffle_v4p0_v3p0__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -416,45 +379,40 @@ define void @v_shuffle_v4p0_v3p0__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -468,39 +426,40 @@ define void @v_shuffle_v4p0_v3p0__5_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_4_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_4_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -514,39 +473,40 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -560,51 +520,54 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -618,51 +581,54 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -676,57 +642,51 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -740,45 +700,42 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -792,45 +749,42 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -844,51 +798,45 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -902,67 +850,54 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -976,58 +911,55 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1041,57 +973,55 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_5_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1105,54 +1035,52 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1166,51 +1094,52 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1224,42 +1153,43 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1274,13 +1204,13 @@ define void @v_shuffle_v4p0_v3p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1288,13 +1218,13 @@ define void @v_shuffle_v4p0_v3p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1302,13 +1232,13 @@ define void @v_shuffle_v4p0_v3p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1370,15 +1300,15 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1386,17 +1316,15 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1404,17 +1332,15 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1430,16 +1356,13 @@ define void @v_shuffle_v4p0_v3p0__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1449,16 +1372,13 @@ define void @v_shuffle_v4p0_v3p0__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1468,16 +1388,13 @@ define void @v_shuffle_v4p0_v3p0__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1491,13 +1408,13 @@ define void @v_shuffle_v4p0_v3p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1505,13 +1422,13 @@ define void @v_shuffle_v4p0_v3p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1519,13 +1436,13 @@ define void @v_shuffle_v4p0_v3p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1604,20 +1521,16 @@ define void @v_shuffle_v4p0_v3p0__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1625,20 +1538,16 @@ define void @v_shuffle_v4p0_v3p0__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1646,21 +1555,16 @@ define void @v_shuffle_v4p0_v3p0__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1675,19 +1579,16 @@ define void @v_shuffle_v4p0_v3p0__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1695,19 +1596,16 @@ define void @v_shuffle_v4p0_v3p0__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1715,20 +1613,16 @@ define void @v_shuffle_v4p0_v3p0__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1742,64 +1636,58 @@ define void @v_shuffle_v4p0_v3p0__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1814,19 +1702,19 @@ define void @v_shuffle_v4p0_v3p0__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,19 +1722,19 @@ define void @v_shuffle_v4p0_v3p0__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1854,19 +1742,19 @@ define void @v_shuffle_v4p0_v3p0__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1881,21 +1769,19 @@ define void @v_shuffle_v4p0_v3p0__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1903,21 +1789,19 @@ define void @v_shuffle_v4p0_v3p0__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1925,21 +1809,19 @@ define void @v_shuffle_v4p0_v3p0__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1954,19 +1836,19 @@ define void @v_shuffle_v4p0_v3p0__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1974,19 +1856,19 @@ define void @v_shuffle_v4p0_v3p0__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1994,20 +1876,19 @@ define void @v_shuffle_v4p0_v3p0__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2022,18 +1903,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2041,18 +1923,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2060,19 +1943,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2087,18 +1970,16 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2106,18 +1987,16 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2125,19 +2004,16 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2152,18 +2028,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2171,18 +2047,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2190,19 +2066,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2217,20 +2092,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2238,20 +2111,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2259,20 +2130,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2287,19 +2156,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2307,19 +2175,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2327,20 +2194,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2355,21 +2220,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2377,21 +2239,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2399,22 +2258,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2575,14 +2431,13 @@ define void @v_shuffle_v4p0_v3p0__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2592,14 +2447,13 @@ define void @v_shuffle_v4p0_v3p0__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2609,14 +2463,13 @@ define void @v_shuffle_v4p0_v3p0__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2743,18 +2596,16 @@ define void @v_shuffle_v4p0_v3p0__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2762,18 +2613,16 @@ define void @v_shuffle_v4p0_v3p0__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2781,19 +2630,16 @@ define void @v_shuffle_v4p0_v3p0__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2808,18 +2654,16 @@ define void @v_shuffle_v4p0_v3p0__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2827,18 +2671,16 @@ define void @v_shuffle_v4p0_v3p0__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2846,19 +2688,16 @@ define void @v_shuffle_v4p0_v3p0__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2873,21 +2712,16 @@ define void @v_shuffle_v4p0_v3p0__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2895,21 +2729,16 @@ define void @v_shuffle_v4p0_v3p0__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2917,22 +2746,16 @@ define void @v_shuffle_v4p0_v3p0__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2947,19 +2770,19 @@ define void @v_shuffle_v4p0_v3p0__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2967,19 +2790,19 @@ define void @v_shuffle_v4p0_v3p0__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2987,19 +2810,19 @@ define void @v_shuffle_v4p0_v3p0__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3014,20 +2837,19 @@ define void @v_shuffle_v4p0_v3p0__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3035,20 +2857,19 @@ define void @v_shuffle_v4p0_v3p0__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3056,20 +2877,19 @@ define void @v_shuffle_v4p0_v3p0__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3084,18 +2904,18 @@ define void @v_shuffle_v4p0_v3p0__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 ; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3104,18 +2924,18 @@ define void @v_shuffle_v4p0_v3p0__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 ; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3124,19 +2944,18 @@ define void @v_shuffle_v4p0_v3p0__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 ; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3152,18 +2971,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3171,18 +2991,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3190,19 +3011,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3216,51 +3037,55 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3274,51 +3099,54 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3333,18 +3161,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3352,18 +3180,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3371,18 +3199,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3397,39 +3225,37 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_3_1: +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3437,20 +3263,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3465,18 +3290,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3484,18 +3310,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3503,19 +3330,20 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3529,42 +3357,43 @@ define void @v_shuffle_v4p0_v3p0__u_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__u_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__u_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__u_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3580,11 +3409,13 @@ define void @v_shuffle_v4p0_v3p0__0_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3594,11 +3425,13 @@ define void @v_shuffle_v4p0_v3p0__0_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3608,11 +3441,13 @@ define void @v_shuffle_v4p0_v3p0__0_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3625,49 +3460,43 @@ define void @v_shuffle_v4p0_v3p0__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3680,42 +3509,43 @@ define void @v_shuffle_v4p0_v3p0__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__2_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__2_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__2_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3728,42 +3558,43 @@ define void @v_shuffle_v4p0_v3p0__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__3_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__3_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__3_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3777,17 +3608,17 @@ define void @v_shuffle_v4p0_v3p0__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: v_mov_b32_e32 v10, v4 ; GFX900-NEXT: v_mov_b32_e32 v11, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3796,17 +3627,17 @@ define void @v_shuffle_v4p0_v3p0__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3815,17 +3646,18 @@ define void @v_shuffle_v4p0_v3p0__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v10, v4 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3841,19 +3673,16 @@ define void @v_shuffle_v4p0_v3p0__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3861,19 +3690,16 @@ define void @v_shuffle_v4p0_v3p0__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3881,19 +3707,16 @@ define void @v_shuffle_v4p0_v3p0__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3908,18 +3731,16 @@ define void @v_shuffle_v4p0_v3p0__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3927,18 +3748,16 @@ define void @v_shuffle_v4p0_v3p0__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3946,18 +3765,17 @@ define void @v_shuffle_v4p0_v3p0__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -3971,66 +3789,52 @@ define void @v_shuffle_v4p0_v3p0__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_0_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_0_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_0_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4044,63 +3848,52 @@ define void @v_shuffle_v4p0_v3p0__5_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_1_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_1_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4114,63 +3907,58 @@ define void @v_shuffle_v4p0_v3p0__5_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v14, v8 +; GFX900-NEXT: v_mov_b32_e32 v15, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v14, v8 +; GFX90A-NEXT: v_mov_b32_e32 v15, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v14, v8 +; GFX942-NEXT: v_mov_b32_e32 v15, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4185,18 +3973,18 @@ define void @v_shuffle_v4p0_v3p0__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v6, v10 -; GFX900-NEXT: v_mov_b32_e32 v7, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v12, v8 +; GFX900-NEXT: v_mov_b32_e32 v13, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4204,18 +3992,18 @@ define void @v_shuffle_v4p0_v3p0__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v10 -; GFX90A-NEXT: v_mov_b32_e32 v7, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v8 +; GFX90A-NEXT: v_mov_b32_e32 v13, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4223,18 +4011,18 @@ define void @v_shuffle_v4p0_v3p0__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v10 -; GFX942-NEXT: v_mov_b32_e32 v7, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v8 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v13, v9 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4249,18 +4037,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4268,18 +4056,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4287,18 +4075,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4312,51 +4100,55 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4371,18 +4163,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: v_mov_b32_e32 v8, v10 ; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4390,18 +4183,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, v10 ; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4409,18 +4203,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: v_mov_b32_e32 v8, v10 ; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4434,51 +4229,54 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4493,19 +4291,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4513,19 +4310,18 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4533,20 +4329,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4561,18 +4356,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4580,18 +4376,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4599,18 +4396,20 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4716,39 +4515,33 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4773,15 +4566,15 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4789,17 +4582,15 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4807,17 +4598,15 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4834,16 +4623,13 @@ define void @v_shuffle_v4p0_v3p0__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4853,16 +4639,13 @@ define void @v_shuffle_v4p0_v3p0__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4872,16 +4655,13 @@ define void @v_shuffle_v4p0_v3p0__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4902,10 +4682,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4919,10 +4696,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4936,10 +4710,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4956,19 +4727,14 @@ define void @v_shuffle_v4p0_v3p0__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4978,19 +4744,14 @@ define void @v_shuffle_v4p0_v3p0__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5000,20 +4761,14 @@ define void @v_shuffle_v4p0_v3p0__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5028,18 +4783,16 @@ define void @v_shuffle_v4p0_v3p0__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5047,18 +4800,16 @@ define void @v_shuffle_v4p0_v3p0__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5066,19 +4817,17 @@ define void @v_shuffle_v4p0_v3p0__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5093,18 +4842,16 @@ define void @v_shuffle_v4p0_v3p0__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5112,18 +4859,16 @@ define void @v_shuffle_v4p0_v3p0__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5131,18 +4876,17 @@ define void @v_shuffle_v4p0_v3p0__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5156,54 +4900,49 @@ define void @v_shuffle_v4p0_v3p0__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_4_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5220,14 +4959,13 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5237,14 +4975,13 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5254,14 +4991,13 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5275,51 +5011,42 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5336,14 +5063,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5353,14 +5080,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5370,14 +5097,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5394,14 +5121,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5411,14 +5138,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5428,14 +5155,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5449,63 +5176,51 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5519,54 +5234,49 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_4_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5763,15 +5473,15 @@ define void @v_shuffle_v4p0_v3p0__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v8 -; GFX900-NEXT: v_mov_b32_e32 v11, v9 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5782,15 +5492,15 @@ define void @v_shuffle_v4p0_v3p0__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v8 -; GFX90A-NEXT: v_mov_b32_e32 v11, v9 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5801,15 +5511,15 @@ define void @v_shuffle_v4p0_v3p0__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v8 -; GFX942-NEXT: v_mov_b32_e32 v11, v9 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5924,14 +5634,13 @@ define void @v_shuffle_v4p0_v3p0__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5941,14 +5650,13 @@ define void @v_shuffle_v4p0_v3p0__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5958,14 +5666,13 @@ define void @v_shuffle_v4p0_v3p0__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5986,10 +5693,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6003,10 +5707,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6020,10 +5721,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6040,19 +5738,14 @@ define void @v_shuffle_v4p0_v3p0__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6062,19 +5755,14 @@ define void @v_shuffle_v4p0_v3p0__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6084,20 +5772,14 @@ define void @v_shuffle_v4p0_v3p0__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6112,18 +5794,16 @@ define void @v_shuffle_v4p0_v3p0__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6131,18 +5811,16 @@ define void @v_shuffle_v4p0_v3p0__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6150,19 +5828,17 @@ define void @v_shuffle_v4p0_v3p0__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6177,18 +5853,16 @@ define void @v_shuffle_v4p0_v3p0__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6196,18 +5870,16 @@ define void @v_shuffle_v4p0_v3p0__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6215,18 +5887,17 @@ define void @v_shuffle_v4p0_v3p0__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6240,60 +5911,56 @@ define void @v_shuffle_v4p0_v3p0__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_3_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v2 ; GFX900-NEXT: v_mov_b32_e32 v7, v3 ; GFX900-NEXT: v_mov_b32_e32 v8, v2 ; GFX900-NEXT: v_mov_b32_e32 v9, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 ; GFX90A-NEXT: v_mov_b32_e32 v10, v2 ; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, v2 +; GFX90A-NEXT: v_mov_b32_e32 v13, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v2 ; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-NEXT: v_mov_b32_e32 v13, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6310,14 +5977,13 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6327,14 +5993,13 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6344,14 +6009,13 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6365,45 +6029,42 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6418,18 +6079,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6437,18 +6099,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6456,19 +6119,20 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6483,18 +6147,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6502,18 +6167,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6521,19 +6187,20 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6550,17 +6217,16 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6570,17 +6236,16 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6590,18 +6255,16 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6615,45 +6278,42 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6667,42 +6327,43 @@ define void @v_shuffle_v4p0_v3p0__u_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__u_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__u_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__u_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6717,18 +6378,18 @@ define void @v_shuffle_v4p0_v3p0__0_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[8:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6736,18 +6397,18 @@ define void @v_shuffle_v4p0_v3p0__0_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[8:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6755,19 +6416,19 @@ define void @v_shuffle_v4p0_v3p0__0_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6782,18 +6443,18 @@ define void @v_shuffle_v4p0_v3p0__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6801,18 +6462,18 @@ define void @v_shuffle_v4p0_v3p0__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6820,19 +6481,19 @@ define void @v_shuffle_v4p0_v3p0__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6849,17 +6510,16 @@ define void @v_shuffle_v4p0_v3p0__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6869,17 +6529,16 @@ define void @v_shuffle_v4p0_v3p0__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6889,17 +6548,16 @@ define void @v_shuffle_v4p0_v3p0__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6916,11 +6574,13 @@ define void @v_shuffle_v4p0_v3p0__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6930,11 +6590,13 @@ define void @v_shuffle_v4p0_v3p0__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6944,11 +6606,13 @@ define void @v_shuffle_v4p0_v3p0__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -6962,49 +6626,43 @@ define void @v_shuffle_v4p0_v3p0__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7018,48 +6676,43 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_u_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_u_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7073,64 +6726,61 @@ define void @v_shuffle_v4p0_v3p0__5_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_0_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_0_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_0_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[8:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7145,18 +6795,19 @@ define void @v_shuffle_v4p0_v3p0__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v8 ; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7164,18 +6815,19 @@ define void @v_shuffle_v4p0_v3p0__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v8 ; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7183,19 +6835,20 @@ define void @v_shuffle_v4p0_v3p0__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v6, v8 ; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7210,18 +6863,19 @@ define void @v_shuffle_v4p0_v3p0__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7229,18 +6883,19 @@ define void @v_shuffle_v4p0_v3p0__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7248,18 +6903,20 @@ define void @v_shuffle_v4p0_v3p0__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:11] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7273,51 +6930,52 @@ define void @v_shuffle_v4p0_v3p0__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_3_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7331,54 +6989,52 @@ define void @v_shuffle_v4p0_v3p0__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_4_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_4_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7392,45 +7048,42 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7445,18 +7098,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7464,18 +7118,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7483,19 +7138,20 @@ define void @v_shuffle_v4p0_v3p0__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:7] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7510,18 +7166,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7529,18 +7186,19 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7548,19 +7206,20 @@ define void @v_shuffle_v4p0_v3p0__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:5] +; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:9] +; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7577,17 +7236,16 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7597,17 +7255,16 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7617,18 +7274,16 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7645,11 +7300,13 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7659,11 +7316,13 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7673,11 +7332,13 @@ define void @v_shuffle_v4p0_v3p0__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7691,45 +7352,42 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7865,10 +7523,9 @@ define void @s_shuffle_v4p0_v3p0__2_u_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -7966,10 +7623,9 @@ define void @s_shuffle_v4p0_v3p0__5_u_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -7986,15 +7642,13 @@ define void @s_shuffle_v4p0_v3p0__5_0_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8004,15 +7658,13 @@ define void @s_shuffle_v4p0_v3p0__5_0_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8075,11 +7727,11 @@ define void @s_shuffle_v4p0_v3p0__5_1_u_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -8130,13 +7782,11 @@ define void @s_shuffle_v4p0_v3p0__5_2_u_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -8149,46 +7799,18 @@ define void @s_shuffle_v4p0_v3p0__5_2_u_u() { } define void @s_shuffle_v4p0_v3p0__5_3_u_u() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_u_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_u_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_3_u_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> @@ -8201,10 +7823,10 @@ define void @s_shuffle_v4p0_v3p0__5_4_u_u() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -8217,50 +7839,18 @@ define void @s_shuffle_v4p0_v3p0__5_4_u_u() { } define void @s_shuffle_v4p0_v3p0__5_5_u_u() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> @@ -8269,65 +7859,21 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_u() { } define void @s_shuffle_v4p0_v3p0__5_5_0_u() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_0_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_0_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> @@ -8340,17 +7886,15 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8360,17 +7904,15 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8380,16 +7922,14 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -8410,12 +7950,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_u() { ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8428,12 +7966,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_u() { ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8445,13 +7981,12 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -8464,52 +7999,20 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_u() { } define void @s_shuffle_v4p0_v3p0__5_5_3_u() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_3_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> @@ -8518,56 +8021,20 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_u() { } define void @s_shuffle_v4p0_v3p0__5_5_4_u() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_4_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> @@ -8580,12 +8047,12 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_u() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -8602,17 +8069,17 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8622,17 +8089,17 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8642,16 +8109,16 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -8665,65 +8132,23 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_0() { } define void @s_shuffle_v4p0_v3p0__5_5_5_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_5_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_5_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> @@ -8732,65 +8157,25 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_1() { } define void @s_shuffle_v4p0_v3p0__5_5_5_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_5_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_5_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> @@ -8799,62 +8184,22 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_2() { } define void @s_shuffle_v4p0_v3p0__5_5_5_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s20 -; GFX900-NEXT: s_mov_b32 s13, s21 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s20 -; GFX90A-NEXT: s_mov_b32 s13, s21 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_5_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_5_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> @@ -8863,58 +8208,22 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_3() { } define void @s_shuffle_v4p0_v3p0__5_5_5_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_5_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_5_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> @@ -8927,14 +8236,14 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_5() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -8947,56 +8256,20 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_5() { } define void @s_shuffle_v4p0_v3p0__u_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) @@ -9027,172 +8300,427 @@ define void @s_shuffle_v4p0_v3p0__0_0_0_0() { } define void @s_shuffle_v4p0_v3p0__1_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_0_0_0: +; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__2_0_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__3_0_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_0_0_0: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_0_0_0: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__4_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__2_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_0_0_0: +define void @s_shuffle_v4p0_v3p0__5_0_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_u_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_1_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_2_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_3_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_3_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_4_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_u_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_0_0_0: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__2_0_0_0: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__3_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0: +define void @s_shuffle_v4p0_v3p0__5_5_1_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART @@ -9200,72 +8728,94 @@ define void @s_shuffle_v4p0_v3p0__3_0_0_0() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__4_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_0_0_0: +define void @s_shuffle_v4p0_v3p0__5_5_2_0() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[16:21] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s20 +; GFX9-NEXT: s_mov_b32 s13, s21 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_3_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_0_0_0: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__4_0_0_0: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -9273,70 +8823,66 @@ define void @s_shuffle_v4p0_v3p0__4_0_0_0() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_0_0: +define void @s_shuffle_v4p0_v3p0__5_5_4_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_0_0: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_0_0_0: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -9344,3422 +8890,640 @@ define void @s_shuffle_v4p0_v3p0__5_0_0_0() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_u_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_0_0: +define void @s_shuffle_v4p0_v3p0__u_1_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__0_1_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__0_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__1_1_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__2_1_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__3_1_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_0_0: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_u_0_0: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_1_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_0_0: +define void @s_shuffle_v4p0_v3p0__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_0_0: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_1_0_0: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s12, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_2_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__5_u_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_u_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_3_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_3_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__5_0_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_0_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_4_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_4_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__5_2_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_2_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__5_3_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_3_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_u_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__5_4_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_1_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_1_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__5_5_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_2_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s20 -; GFX900-NEXT: s_mov_b32 s13, s21 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s20 -; GFX90A-NEXT: s_mov_b32 s13, s21 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_2_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__5_5_u_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_u_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_3_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_3_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__5_5_0_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_0_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_4_0() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_4_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__5_5_2_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s16 +; GFX9-NEXT: s_mov_b32 s13, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__u_1_1_1() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1: +define void @s_shuffle_v4p0_v3p0__5_5_3_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_3_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[12:17] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__0_1_1_1() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__0_1_1_1: +define void @s_shuffle_v4p0_v3p0__5_5_4_1() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_4_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[12:17] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__1_1_1_1() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1: +define void @s_shuffle_v4p0_v3p0__u_2_2_2() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__2_1_1_1() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1: +define void @s_shuffle_v4p0_v3p0__0_2_2_2() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__3_1_1_1() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1: +define void @s_shuffle_v4p0_v3p0__1_2_2_2() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__4_1_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_1_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_1_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_u_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_u_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_0_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_0_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_2_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_2_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_3_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_3_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_4_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_4_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_u_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_0_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_0_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_2_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_2_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_3_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_4_1() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_4_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__u_2_2_2() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__0_2_2_2() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__1_2_2_2() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__2_2_2_2() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__3_2_2_2() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__4_2_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_2_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_2_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__4_2_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_2_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_2_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_u_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_u_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_0_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_0_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_1_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_1_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_3_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_4_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_4_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_u_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_0_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[20:25] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s24 -; GFX900-NEXT: s_mov_b32 s9, s25 -; GFX900-NEXT: s_mov_b32 s10, s24 -; GFX900-NEXT: s_mov_b32 s11, s25 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[20:25] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s24 -; GFX90A-NEXT: s_mov_b32 s9, s25 -; GFX90A-NEXT: s_mov_b32 s10, s24 -; GFX90A-NEXT: s_mov_b32 s11, s25 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_0_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_1_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_3_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[20:25] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s24 -; GFX900-NEXT: s_mov_b32 s9, s25 -; GFX900-NEXT: s_mov_b32 s10, s24 -; GFX900-NEXT: s_mov_b32 s11, s25 -; GFX900-NEXT: s_mov_b32 s12, s20 -; GFX900-NEXT: s_mov_b32 s13, s21 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[20:25] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s24 -; GFX90A-NEXT: s_mov_b32 s9, s25 -; GFX90A-NEXT: s_mov_b32 s10, s24 -; GFX90A-NEXT: s_mov_b32 s11, s25 -; GFX90A-NEXT: s_mov_b32 s12, s20 -; GFX90A-NEXT: s_mov_b32 s13, s21 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_4_2() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[20:25] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s24 -; GFX900-NEXT: s_mov_b32 s9, s25 -; GFX900-NEXT: s_mov_b32 s10, s24 -; GFX900-NEXT: s_mov_b32 s11, s25 -; GFX900-NEXT: s_mov_b32 s12, s22 -; GFX900-NEXT: s_mov_b32 s13, s23 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[20:25] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s24 -; GFX90A-NEXT: s_mov_b32 s9, s25 -; GFX90A-NEXT: s_mov_b32 s10, s24 -; GFX90A-NEXT: s_mov_b32 s11, s25 -; GFX90A-NEXT: s_mov_b32 s12, s22 -; GFX90A-NEXT: s_mov_b32 s13, s23 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_4_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__u_3_3_3() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__0_3_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_3_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_3_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__0_3_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__1_3_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__2_3_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__3_3_3_3() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__4_3_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_3_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_3_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__4_3_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_3_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_3_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_u_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_u_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_0_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_0_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_1_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_1_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_2_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_2_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:21] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s20 -; GFX942-NEXT: s_mov_b32 s9, s21 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s16 -; GFX942-NEXT: s_mov_b32 s13, s17 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__2_2_2_2() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_4_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_3_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_4_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__3_2_2_2() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_3: +define void @s_shuffle_v4p0_v3p0__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_3_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__5_5_u_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: s_mov_b32 s14, s12 ; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART @@ -12767,16 +9531,19 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_3: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: s_mov_b32 s14, s12 ; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART @@ -12784,1637 +9551,1983 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_3: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__4_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_0_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_3: +define void @s_shuffle_v4p0_v3p0__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_3: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_0_3: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:21] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s20 -; GFX942-NEXT: s_mov_b32 s9, s21 -; GFX942-NEXT: s_mov_b32 s10, s20 -; GFX942-NEXT: s_mov_b32 s11, s21 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_1_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_3: +define void @s_shuffle_v4p0_v3p0__5_u_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_3: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_1_3: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_u_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:21] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s20 -; GFX942-NEXT: s_mov_b32 s9, s21 -; GFX942-NEXT: s_mov_b32 s10, s20 -; GFX942-NEXT: s_mov_b32 s11, s21 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_2_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_3: +define void @s_shuffle_v4p0_v3p0__5_0_2_2() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_0_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s16 +; GFX9-NEXT: s_mov_b32 s13, s17 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_1_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_3: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_2_3: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_1_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_4_3() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_3: +define void @s_shuffle_v4p0_v3p0__5_3_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_3: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_4_3: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__u_4_4_4() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__0_4_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_4_4_4: +define void @s_shuffle_v4p0_v3p0__5_4_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_4_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__0_4_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_4_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__1_4_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_4_4_4: +define void @s_shuffle_v4p0_v3p0__5_5_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_4_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_4_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__2_4_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_4_4_4: +define void @s_shuffle_v4p0_v3p0__5_5_u_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_4_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__2_4_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__3_4_4_4() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4: +define void @s_shuffle_v4p0_v3p0__5_5_0_2() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_0_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[12:17] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__4_4_4_4() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4: +define void @s_shuffle_v4p0_v3p0__5_5_1_2() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_3_2() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_4_2() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_4_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[12:17] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_4_4_4() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4: +define void @s_shuffle_v4p0_v3p0__u_3_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_3_3_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_u_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_4_4: +define void @s_shuffle_v4p0_v3p0__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_u_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__0_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_0_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_4_4: +define void @s_shuffle_v4p0_v3p0__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_0_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_1_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_4_4: +define void @s_shuffle_v4p0_v3p0__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_1_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__3_3_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__4_3_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__4_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_2_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_4_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_4_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_2_4_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__5_3_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_3_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_4_4: +define void @s_shuffle_v4p0_v3p0__5_u_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_0_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_3_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_0_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 ; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_4: +define void @s_shuffle_v4p0_v3p0__5_1_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_1_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_u_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_4: +define void @s_shuffle_v4p0_v3p0__5_2_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_0_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_4: +define void @s_shuffle_v4p0_v3p0__5_4_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_u_3() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_0_3() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_0_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_1_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_0_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_1_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_1_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_4: +define void @s_shuffle_v4p0_v3p0__5_5_2_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_1_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_2_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_4: +define void @s_shuffle_v4p0_v3p0__5_5_4_3() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_4_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__u_4_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_2_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__0_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_3_4() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_4: +define void @s_shuffle_v4p0_v3p0__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_4: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_3_4: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__u_5_5_5() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5: +define void @s_shuffle_v4p0_v3p0__2_4_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__3_4_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__0_5_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_5_5_5: +define void @s_shuffle_v4p0_v3p0__4_4_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_4_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_u_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_0_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_5_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__0_5_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_0_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__1_5_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_5_5_5: +define void @s_shuffle_v4p0_v3p0__5_1_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_5_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_5_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_1_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__2_5_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_5_5_5: +define void @s_shuffle_v4p0_v3p0__5_2_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s10, s12 ; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_5_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s10, s12 ; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__2_5_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__3_5_5_5() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5: +define void @s_shuffle_v4p0_v3p0__5_3_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_3_4_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__4_5_5_5() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5: +define void @s_shuffle_v4p0_v3p0__5_5_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_4_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_u_5_5() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5: +define void @s_shuffle_v4p0_v3p0__5_5_u_4() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_u_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ; def s[4:9] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_0_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_5_5: +define void @s_shuffle_v4p0_v3p0__5_5_0_4() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_0_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_1_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_1_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_2_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_0_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_1_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_5_5: +define void @s_shuffle_v4p0_v3p0__5_5_3_4() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_3_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__u_5_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -14423,8 +11536,8 @@ define void @s_shuffle_v4p0_v3p0__5_1_5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s12, s16 ; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: s_mov_b32 s14, s16 @@ -14434,7 +11547,7 @@ define void @s_shuffle_v4p0_v3p0__5_1_5_5() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -14443,8 +11556,8 @@ define void @s_shuffle_v4p0_v3p0__5_1_5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s12, s16 ; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: s_mov_b32 s14, s16 @@ -14454,7 +11567,7 @@ define void @s_shuffle_v4p0_v3p0__5_1_5_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_1_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__0_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -14463,8 +11576,8 @@ define void @s_shuffle_v4p0_v3p0__5_1_5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: s_mov_b32 s12, s4 ; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: s_mov_b32 s14, s4 @@ -14475,25 +11588,25 @@ define void @s_shuffle_v4p0_v3p0__5_1_5_5() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_2_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_5_5: +define void @s_shuffle_v4p0_v3p0__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: s_mov_b32 s14, s12 ; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART @@ -14501,19 +11614,19 @@ define void @s_shuffle_v4p0_v3p0__5_2_5_5() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: s_mov_b32 s14, s12 ; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART @@ -14521,7 +11634,7 @@ define void @s_shuffle_v4p0_v3p0__5_2_5_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_2_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -14530,10 +11643,10 @@ define void @s_shuffle_v4p0_v3p0__5_2_5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART @@ -14542,22 +11655,23 @@ define void @s_shuffle_v4p0_v3p0__5_2_5_5() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_3_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_5_5: +define void @s_shuffle_v4p0_v3p0__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s12, s16 ; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: s_mov_b32 s14, s16 @@ -14567,16 +11681,17 @@ define void @s_shuffle_v4p0_v3p0__5_3_5_5() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s12, s16 ; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: s_mov_b32 s14, s16 @@ -14586,16 +11701,18 @@ define void @s_shuffle_v4p0_v3p0__5_3_5_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_3_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__2_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: s_mov_b32 s12, s4 ; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: s_mov_b32 s14, s4 @@ -14606,20 +11723,20 @@ define void @s_shuffle_v4p0_v3p0__5_3_5_5() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_4_5_5() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5: +define void @s_shuffle_v4p0_v3p0__3_5_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 ; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART @@ -14628,399 +11745,518 @@ define void @s_shuffle_v4p0_v3p0__5_4_5_5() { ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_u_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v3p0__4_5_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:13] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_0_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_5: +define void @s_shuffle_v4p0_v3p0__5_u_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_0_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_5: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_0_5: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_0_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 +; GFX942-NEXT: s_mov_b32 s14, s8 +; GFX942-NEXT: s_mov_b32 s15, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_1_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_5: +define void @s_shuffle_v4p0_v3p0__5_1_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_5: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_1_5: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_1_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:17] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s16 -; GFX942-NEXT: s_mov_b32 s9, s17 -; GFX942-NEXT: s_mov_b32 s10, s16 -; GFX942-NEXT: s_mov_b32 s11, s17 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 +; GFX942-NEXT: s_mov_b32 s14, s8 +; GFX942-NEXT: s_mov_b32 s15, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_2_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_5: +define void @s_shuffle_v4p0_v3p0__5_2_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s20 -; GFX900-NEXT: s_mov_b32 s15, s21 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_5: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s20 -; GFX90A-NEXT: s_mov_b32 s15, s21 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_2_5: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_2_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s8 +; GFX942-NEXT: s_mov_b32 s13, s9 +; GFX942-NEXT: s_mov_b32 s14, s8 +; GFX942-NEXT: s_mov_b32 s15, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_3_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_5: +define void @s_shuffle_v4p0_v3p0__5_3_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_3_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_4_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_u_5() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_0_5() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_0_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:17] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_1_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_5: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_3_5: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_1_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s14, s8 +; GFX942-NEXT: s_mov_b32 s15, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v3p0__5_5_4_5() { -; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_5: +define void @s_shuffle_v4p0_v3p0__5_5_2_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_5: +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_4_5: +; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[8:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: s_mov_b32 s14, s8 +; GFX942-NEXT: s_mov_b32 s15, s9 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_3_5() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_3_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_4_5() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_4_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll index 257af574366a6..3e354c8006fd3 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll @@ -139,39 +139,33 @@ define void @v_shuffle_v4p0_v4p0__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__3_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__3_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -275,39 +269,33 @@ define void @v_shuffle_v4p0_v4p0__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -321,55 +309,42 @@ define void @v_shuffle_v4p0_v4p0__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_0_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_0_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -383,49 +358,43 @@ define void @v_shuffle_v4p0_v4p0__7_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_1_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_1_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -439,49 +408,43 @@ define void @v_shuffle_v4p0_v4p0__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_2_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_2_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -495,49 +458,43 @@ define void @v_shuffle_v4p0_v4p0__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -551,45 +508,40 @@ define void @v_shuffle_v4p0_v4p0__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_4_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_4_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -603,39 +555,40 @@ define void @v_shuffle_v4p0_v4p0__7_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_5_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -649,39 +602,40 @@ define void @v_shuffle_v4p0_v4p0__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_6_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_6_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_6_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -695,39 +649,40 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -741,51 +696,54 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -799,51 +757,54 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -857,51 +818,54 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -915,57 +879,52 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -979,42 +938,42 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1028,45 +987,42 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1080,45 +1036,42 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_6_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_6_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_6_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1132,48 +1085,45 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_7_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1187,64 +1137,54 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v6 ; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_7_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1258,58 +1198,55 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_7_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1323,58 +1260,55 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_7_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1388,57 +1322,55 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_7_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1452,57 +1384,52 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_7_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1516,48 +1443,52 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1571,51 +1502,52 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_7_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1629,42 +1561,43 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1679,13 +1612,13 @@ define void @v_shuffle_v4p0_v4p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1693,13 +1626,13 @@ define void @v_shuffle_v4p0_v4p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1707,13 +1640,13 @@ define void @v_shuffle_v4p0_v4p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1775,15 +1708,15 @@ define void @v_shuffle_v4p0_v4p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1791,15 +1724,15 @@ define void @v_shuffle_v4p0_v4p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1807,15 +1740,15 @@ define void @v_shuffle_v4p0_v4p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1885,15 +1818,13 @@ define void @v_shuffle_v4p0_v4p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1903,15 +1834,13 @@ define void @v_shuffle_v4p0_v4p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1921,15 +1850,13 @@ define void @v_shuffle_v4p0_v4p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1943,13 +1870,13 @@ define void @v_shuffle_v4p0_v4p0__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1957,13 +1884,13 @@ define void @v_shuffle_v4p0_v4p0__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1971,13 +1898,13 @@ define void @v_shuffle_v4p0_v4p0__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2121,20 +2048,16 @@ define void @v_shuffle_v4p0_v4p0__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2142,20 +2065,16 @@ define void @v_shuffle_v4p0_v4p0__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2163,21 +2082,16 @@ define void @v_shuffle_v4p0_v4p0__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2192,19 +2106,16 @@ define void @v_shuffle_v4p0_v4p0__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2212,19 +2123,16 @@ define void @v_shuffle_v4p0_v4p0__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,20 +2140,16 @@ define void @v_shuffle_v4p0_v4p0__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2260,64 +2164,58 @@ define void @v_shuffle_v4p0_v4p0__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_1_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v14, v4 +; GFX90A-NEXT: v_mov_b32_e32 v15, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_1_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v14, v4 +; GFX942-NEXT: v_mov_b32_e32 v15, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2332,19 +2230,19 @@ define void @v_shuffle_v4p0_v4p0__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2352,19 +2250,19 @@ define void @v_shuffle_v4p0_v4p0__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2372,19 +2270,19 @@ define void @v_shuffle_v4p0_v4p0__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2399,18 +2297,19 @@ define void @v_shuffle_v4p0_v4p0__7_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2418,18 +2317,19 @@ define void @v_shuffle_v4p0_v4p0__7_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2437,18 +2337,19 @@ define void @v_shuffle_v4p0_v4p0__7_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2463,21 +2364,19 @@ define void @v_shuffle_v4p0_v4p0__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2485,21 +2384,19 @@ define void @v_shuffle_v4p0_v4p0__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2507,21 +2404,19 @@ define void @v_shuffle_v4p0_v4p0__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2536,19 +2431,19 @@ define void @v_shuffle_v4p0_v4p0__7_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2556,19 +2451,19 @@ define void @v_shuffle_v4p0_v4p0__7_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2576,20 +2471,19 @@ define void @v_shuffle_v4p0_v4p0__7_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2604,18 +2498,19 @@ define void @v_shuffle_v4p0_v4p0__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2623,18 +2518,19 @@ define void @v_shuffle_v4p0_v4p0__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2642,19 +2538,19 @@ define void @v_shuffle_v4p0_v4p0__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2669,18 +2565,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2688,18 +2585,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2707,19 +2605,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2733,19 +2631,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2753,18 +2650,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2772,19 +2667,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2799,18 +2691,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, v8 +; GFX900-NEXT: v_mov_b32_e32 v13, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2818,18 +2710,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, v8 +; GFX90A-NEXT: v_mov_b32_e32 v13, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2837,19 +2729,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v8 +; GFX942-NEXT: v_mov_b32_e32 v13, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2864,18 +2755,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, v8 +; GFX900-NEXT: v_mov_b32_e32 v15, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2883,18 +2774,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, v8 +; GFX90A-NEXT: v_mov_b32_e32 v15, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2902,19 +2793,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v14, v8 +; GFX942-NEXT: v_mov_b32_e32 v15, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2928,21 +2818,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2950,20 +2839,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2971,20 +2858,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2999,18 +2884,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3018,18 +2903,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3037,19 +2922,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v8 +; GFX942-NEXT: v_mov_b32_e32 v3, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3064,19 +2948,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3084,19 +2967,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3104,20 +2986,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3132,20 +3012,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3153,20 +3031,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3174,26 +3050,24 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=v"() - %vec1 = call <4 x ptr> asm "; def $0", "=v"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v9 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 ret void } @@ -3403,13 +3277,13 @@ define void @v_shuffle_v4p0_v4p0__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3419,13 +3293,13 @@ define void @v_shuffle_v4p0_v4p0__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3435,13 +3309,13 @@ define void @v_shuffle_v4p0_v4p0__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3633,18 +3507,16 @@ define void @v_shuffle_v4p0_v4p0__7_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3652,18 +3524,16 @@ define void @v_shuffle_v4p0_v4p0__7_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3671,19 +3541,16 @@ define void @v_shuffle_v4p0_v4p0__7_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3698,18 +3565,16 @@ define void @v_shuffle_v4p0_v4p0__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3717,18 +3582,16 @@ define void @v_shuffle_v4p0_v4p0__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3736,19 +3599,16 @@ define void @v_shuffle_v4p0_v4p0__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3763,20 +3623,16 @@ define void @v_shuffle_v4p0_v4p0__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3784,21 +3640,16 @@ define void @v_shuffle_v4p0_v4p0__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3806,22 +3657,16 @@ define void @v_shuffle_v4p0_v4p0__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3836,19 +3681,19 @@ define void @v_shuffle_v4p0_v4p0__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3856,19 +3701,19 @@ define void @v_shuffle_v4p0_v4p0__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3876,19 +3721,19 @@ define void @v_shuffle_v4p0_v4p0__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3903,19 +3748,19 @@ define void @v_shuffle_v4p0_v4p0__7_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3923,19 +3768,19 @@ define void @v_shuffle_v4p0_v4p0__7_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3943,19 +3788,19 @@ define void @v_shuffle_v4p0_v4p0__7_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -3970,20 +3815,19 @@ define void @v_shuffle_v4p0_v4p0__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3991,20 +3835,19 @@ define void @v_shuffle_v4p0_v4p0__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v12 -; GFX90A-NEXT: v_mov_b32_e32 v1, v13 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4012,20 +3855,19 @@ define void @v_shuffle_v4p0_v4p0__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v12 -; GFX942-NEXT: v_mov_b32_e32 v1, v13 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4040,19 +3882,19 @@ define void @v_shuffle_v4p0_v4p0__7_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v10 -; GFX900-NEXT: v_mov_b32_e32 v5, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,19 +3902,19 @@ define void @v_shuffle_v4p0_v4p0__7_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v10 -; GFX90A-NEXT: v_mov_b32_e32 v5, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4080,20 +3922,19 @@ define void @v_shuffle_v4p0_v4p0__7_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v10 -; GFX942-NEXT: v_mov_b32_e32 v5, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4108,18 +3949,19 @@ define void @v_shuffle_v4p0_v4p0__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v10 -; GFX900-NEXT: v_mov_b32_e32 v7, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4127,18 +3969,19 @@ define void @v_shuffle_v4p0_v4p0__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v10 -; GFX90A-NEXT: v_mov_b32_e32 v7, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4146,19 +3989,19 @@ define void @v_shuffle_v4p0_v4p0__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v10 -; GFX942-NEXT: v_mov_b32_e32 v7, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4173,18 +4016,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4192,18 +4036,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4211,19 +4056,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4237,51 +4082,55 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4295,51 +4144,54 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4354,18 +4206,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v8 +; GFX900-NEXT: v_mov_b32_e32 v13, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4373,18 +4225,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v8 +; GFX90A-NEXT: v_mov_b32_e32 v13, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4392,19 +4244,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v12, v8 +; GFX942-NEXT: v_mov_b32_e32 v13, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4418,19 +4269,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4438,18 +4290,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4457,18 +4309,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4483,18 +4335,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4502,18 +4354,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4521,19 +4373,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v8 +; GFX942-NEXT: v_mov_b32_e32 v3, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4548,19 +4400,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v2 -; GFX900-NEXT: v_mov_b32_e32 v9, v3 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4568,19 +4419,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4588,20 +4438,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4616,18 +4465,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v8 -; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4635,18 +4485,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4654,19 +4505,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v8 -; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -4881,14 +4733,13 @@ define void @v_shuffle_v4p0_v4p0__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4898,14 +4749,13 @@ define void @v_shuffle_v4p0_v4p0__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4915,14 +4765,13 @@ define void @v_shuffle_v4p0_v4p0__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5114,18 +4963,16 @@ define void @v_shuffle_v4p0_v4p0__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5133,18 +4980,16 @@ define void @v_shuffle_v4p0_v4p0__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5152,19 +4997,16 @@ define void @v_shuffle_v4p0_v4p0__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5179,18 +5021,16 @@ define void @v_shuffle_v4p0_v4p0__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5198,18 +5038,16 @@ define void @v_shuffle_v4p0_v4p0__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v12 -; GFX90A-NEXT: v_mov_b32_e32 v1, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5217,19 +5055,16 @@ define void @v_shuffle_v4p0_v4p0__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v12 -; GFX942-NEXT: v_mov_b32_e32 v1, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5244,20 +5079,16 @@ define void @v_shuffle_v4p0_v4p0__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v12 +; GFX900-NEXT: v_mov_b32_e32 v15, v13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5265,20 +5096,16 @@ define void @v_shuffle_v4p0_v4p0__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v12 +; GFX90A-NEXT: v_mov_b32_e32 v15, v13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5286,21 +5113,16 @@ define void @v_shuffle_v4p0_v4p0__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v12 +; GFX942-NEXT: v_mov_b32_e32 v15, v13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5315,18 +5137,16 @@ define void @v_shuffle_v4p0_v4p0__7_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v12 -; GFX900-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5334,18 +5154,16 @@ define void @v_shuffle_v4p0_v4p0__7_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v12 -; GFX90A-NEXT: v_mov_b32_e32 v1, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5353,19 +5171,16 @@ define void @v_shuffle_v4p0_v4p0__7_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v12 -; GFX942-NEXT: v_mov_b32_e32 v1, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5380,19 +5195,19 @@ define void @v_shuffle_v4p0_v4p0__7_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5400,19 +5215,19 @@ define void @v_shuffle_v4p0_v4p0__7_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5420,19 +5235,19 @@ define void @v_shuffle_v4p0_v4p0__7_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5447,20 +5262,19 @@ define void @v_shuffle_v4p0_v4p0__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5468,20 +5282,19 @@ define void @v_shuffle_v4p0_v4p0__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5489,20 +5302,19 @@ define void @v_shuffle_v4p0_v4p0__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5517,19 +5329,19 @@ define void @v_shuffle_v4p0_v4p0__7_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v12 -; GFX900-NEXT: v_mov_b32_e32 v7, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5537,19 +5349,19 @@ define void @v_shuffle_v4p0_v4p0__7_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v12 -; GFX90A-NEXT: v_mov_b32_e32 v7, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5557,20 +5369,19 @@ define void @v_shuffle_v4p0_v4p0__7_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v6, v12 -; GFX942-NEXT: v_mov_b32_e32 v7, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5585,18 +5396,19 @@ define void @v_shuffle_v4p0_v4p0__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v12 -; GFX900-NEXT: v_mov_b32_e32 v9, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5604,18 +5416,19 @@ define void @v_shuffle_v4p0_v4p0__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v8, v12 -; GFX90A-NEXT: v_mov_b32_e32 v9, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5623,19 +5436,19 @@ define void @v_shuffle_v4p0_v4p0__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v8, v12 -; GFX942-NEXT: v_mov_b32_e32 v9, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5650,18 +5463,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5669,18 +5483,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5688,19 +5503,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5714,51 +5529,55 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5773,18 +5592,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v10, v12 ; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5792,18 +5612,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: v_mov_b32_e32 v10, v12 ; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5811,18 +5632,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: v_mov_b32_e32 v10, v12 ; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5836,51 +5658,54 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_1_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_1_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5894,19 +5719,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5914,18 +5740,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5933,18 +5759,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -5959,18 +5785,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5978,18 +5804,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5997,19 +5823,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v8 +; GFX942-NEXT: v_mov_b32_e32 v3, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6024,19 +5850,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v4 -; GFX900-NEXT: v_mov_b32_e32 v11, v5 -; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6044,19 +5869,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 -; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6064,20 +5888,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6092,18 +5915,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v10 -; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6111,18 +5935,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6130,19 +5955,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v10 -; GFX942-NEXT: v_mov_b32_e32 v3, v11 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6156,42 +5982,43 @@ define void @v_shuffle_v4p0_v4p0__u_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__u_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__u_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__u_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6207,13 +6034,13 @@ define void @v_shuffle_v4p0_v4p0__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v2, v6 ; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,13 +6050,13 @@ define void @v_shuffle_v4p0_v4p0__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v2, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6239,13 +6066,13 @@ define void @v_shuffle_v4p0_v4p0__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v2, v6 ; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6261,11 +6088,13 @@ define void @v_shuffle_v4p0_v4p0__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6275,11 +6104,13 @@ define void @v_shuffle_v4p0_v4p0__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6289,11 +6120,13 @@ define void @v_shuffle_v4p0_v4p0__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6306,48 +6139,43 @@ define void @v_shuffle_v4p0_v4p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6360,42 +6188,43 @@ define void @v_shuffle_v4p0_v4p0__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__3_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__3_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6408,42 +6237,43 @@ define void @v_shuffle_v4p0_v4p0__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__4_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__4_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__4_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6457,17 +6287,17 @@ define void @v_shuffle_v4p0_v4p0__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v12, v6 ; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -6476,17 +6306,17 @@ define void @v_shuffle_v4p0_v4p0__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v12, v6 ; GFX90A-NEXT: v_mov_b32_e32 v13, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6495,17 +6325,18 @@ define void @v_shuffle_v4p0_v4p0__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v12, v6 ; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -6521,17 +6352,17 @@ define void @v_shuffle_v4p0_v4p0__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v14, v6 ; GFX900-NEXT: v_mov_b32_e32 v15, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -6540,17 +6371,17 @@ define void @v_shuffle_v4p0_v4p0__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v14, v6 ; GFX90A-NEXT: v_mov_b32_e32 v15, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6559,17 +6390,18 @@ define void @v_shuffle_v4p0_v4p0__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v14, v6 ; GFX942-NEXT: v_mov_b32_e32 v15, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -6585,19 +6417,16 @@ define void @v_shuffle_v4p0_v4p0__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6605,19 +6434,16 @@ define void @v_shuffle_v4p0_v4p0__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6625,19 +6451,16 @@ define void @v_shuffle_v4p0_v4p0__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6651,19 +6474,18 @@ define void @v_shuffle_v4p0_v4p0__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6671,18 +6493,16 @@ define void @v_shuffle_v4p0_v4p0__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6690,18 +6510,17 @@ define void @v_shuffle_v4p0_v4p0__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6718,60 +6537,49 @@ define void @v_shuffle_v4p0_v4p0__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_0_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v14 -; GFX90A-NEXT: v_mov_b32_e32 v3, v15 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_0_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v14 -; GFX942-NEXT: v_mov_b32_e32 v3, v15 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6785,57 +6593,52 @@ define void @v_shuffle_v4p0_v4p0__7_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v12 +; GFX900-NEXT: v_mov_b32_e32 v15, v13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_1_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v12 +; GFX90A-NEXT: v_mov_b32_e32 v15, v13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_1_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v12 +; GFX942-NEXT: v_mov_b32_e32 v15, v13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6849,66 +6652,52 @@ define void @v_shuffle_v4p0_v4p0__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v14 -; GFX90A-NEXT: v_mov_b32_e32 v3, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v14 -; GFX942-NEXT: v_mov_b32_e32 v3, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6922,63 +6711,59 @@ define void @v_shuffle_v4p0_v4p0__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_4_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v14 -; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[10:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v14 -; GFX90A-NEXT: v_mov_b32_e32 v1, v15 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v18, v10 +; GFX90A-NEXT: v_mov_b32_e32 v19, v11 +; GFX90A-NEXT: global_store_dwordx4 v20, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v20, v[16:19], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v20, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[10:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v14 -; GFX942-NEXT: v_mov_b32_e32 v1, v15 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v18, v10 +; GFX942-NEXT: v_mov_b32_e32 v19, v11 +; GFX942-NEXT: global_store_dwordx4 v20, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v20, v[16:19], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -6992,19 +6777,20 @@ define void @v_shuffle_v4p0_v4p0__7_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v8, v14 -; GFX900-NEXT: v_mov_b32_e32 v9, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7012,18 +6798,18 @@ define void @v_shuffle_v4p0_v4p0__7_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v14 -; GFX90A-NEXT: v_mov_b32_e32 v9, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v16, v10 +; GFX90A-NEXT: v_mov_b32_e32 v17, v11 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7031,18 +6817,18 @@ define void @v_shuffle_v4p0_v4p0__7_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v8, v14 -; GFX942-NEXT: v_mov_b32_e32 v9, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, v10 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v17, v11 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7056,19 +6842,20 @@ define void @v_shuffle_v4p0_v4p0__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_6_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v14 -; GFX900-NEXT: v_mov_b32_e32 v11, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7076,18 +6863,18 @@ define void @v_shuffle_v4p0_v4p0__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v10, v14 -; GFX90A-NEXT: v_mov_b32_e32 v11, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v16, v12 +; GFX90A-NEXT: v_mov_b32_e32 v17, v13 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7095,18 +6882,18 @@ define void @v_shuffle_v4p0_v4p0__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v10, v14 -; GFX942-NEXT: v_mov_b32_e32 v11, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, v12 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v17, v13 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7120,19 +6907,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7140,18 +6928,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7159,18 +6947,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7184,51 +6972,55 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7243,18 +7035,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v14 +; GFX900-NEXT: v_mov_b32_e32 v11, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7262,18 +7055,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v14 +; GFX90A-NEXT: v_mov_b32_e32 v11, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7281,18 +7075,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v14 +; GFX942-NEXT: v_mov_b32_e32 v11, v15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7307,18 +7102,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7326,18 +7122,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7345,18 +7142,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v12 +; GFX942-NEXT: v_mov_b32_e32 v11, v13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7370,51 +7168,54 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7429,18 +7230,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v6 -; GFX900-NEXT: v_mov_b32_e32 v11, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7448,18 +7249,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v6 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7467,18 +7268,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v10, v6 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v8 +; GFX942-NEXT: v_mov_b32_e32 v3, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7493,19 +7295,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v6 -; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7513,19 +7314,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, v6 -; GFX90A-NEXT: v_mov_b32_e32 v13, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7533,20 +7333,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v6 -; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7561,18 +7360,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v12 -; GFX900-NEXT: v_mov_b32_e32 v5, v13 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7580,18 +7380,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v12 -; GFX90A-NEXT: v_mov_b32_e32 v5, v13 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7599,18 +7400,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v12 -; GFX942-NEXT: v_mov_b32_e32 v5, v13 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7755,39 +7558,33 @@ define void @v_shuffle_v4p0_v4p0__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__3_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__3_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7812,15 +7609,15 @@ define void @v_shuffle_v4p0_v4p0__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7828,15 +7625,15 @@ define void @v_shuffle_v4p0_v4p0__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7844,15 +7641,15 @@ define void @v_shuffle_v4p0_v4p0__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7924,15 +7721,13 @@ define void @v_shuffle_v4p0_v4p0__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7942,15 +7737,13 @@ define void @v_shuffle_v4p0_v4p0__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7960,15 +7753,13 @@ define void @v_shuffle_v4p0_v4p0__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -7989,10 +7780,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8006,10 +7794,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8023,10 +7808,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8043,18 +7825,14 @@ define void @v_shuffle_v4p0_v4p0__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8064,19 +7842,14 @@ define void @v_shuffle_v4p0_v4p0__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8086,20 +7859,14 @@ define void @v_shuffle_v4p0_v4p0__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8114,18 +7881,16 @@ define void @v_shuffle_v4p0_v4p0__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8133,18 +7898,16 @@ define void @v_shuffle_v4p0_v4p0__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8152,19 +7915,17 @@ define void @v_shuffle_v4p0_v4p0__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8179,18 +7940,16 @@ define void @v_shuffle_v4p0_v4p0__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8198,18 +7957,16 @@ define void @v_shuffle_v4p0_v4p0__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8217,19 +7974,17 @@ define void @v_shuffle_v4p0_v4p0__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8244,18 +7999,16 @@ define void @v_shuffle_v4p0_v4p0__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v8 -; GFX900-NEXT: v_mov_b32_e32 v11, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8263,18 +8016,16 @@ define void @v_shuffle_v4p0_v4p0__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v8 -; GFX90A-NEXT: v_mov_b32_e32 v11, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8282,18 +8033,17 @@ define void @v_shuffle_v4p0_v4p0__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v8 -; GFX942-NEXT: v_mov_b32_e32 v11, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8307,54 +8057,49 @@ define void @v_shuffle_v4p0_v4p0__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: v_mov_b32_e32 v12, v0 -; GFX90A-NEXT: v_mov_b32_e32 v13, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v12, v0 -; GFX942-NEXT: v_mov_b32_e32 v13, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8371,14 +8116,13 @@ define void @v_shuffle_v4p0_v4p0__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8388,14 +8132,13 @@ define void @v_shuffle_v4p0_v4p0__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8405,14 +8148,13 @@ define void @v_shuffle_v4p0_v4p0__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8429,13 +8171,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8445,13 +8187,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8461,13 +8203,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8481,48 +8223,42 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8539,14 +8275,14 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8556,14 +8292,14 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8573,14 +8309,14 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8597,14 +8333,14 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8614,14 +8350,14 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8631,14 +8367,14 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8655,14 +8391,14 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, v12 +; GFX900-NEXT: v_mov_b32_e32 v15, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8672,14 +8408,14 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, v12 +; GFX90A-NEXT: v_mov_b32_e32 v15, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8689,14 +8425,14 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v14, v12 +; GFX942-NEXT: v_mov_b32_e32 v15, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8713,60 +8449,48 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8783,14 +8507,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8800,14 +8523,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8817,14 +8539,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -8838,57 +8559,49 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_6_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_6_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_6_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -9150,15 +8863,15 @@ define void @v_shuffle_v4p0_v4p0__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v10 -; GFX900-NEXT: v_mov_b32_e32 v13, v11 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9169,15 +8882,15 @@ define void @v_shuffle_v4p0_v4p0__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, v10 -; GFX90A-NEXT: v_mov_b32_e32 v13, v11 -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9188,15 +8901,15 @@ define void @v_shuffle_v4p0_v4p0__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, v10 -; GFX942-NEXT: v_mov_b32_e32 v13, v11 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -9366,13 +9079,13 @@ define void @v_shuffle_v4p0_v4p0__7_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9382,13 +9095,13 @@ define void @v_shuffle_v4p0_v4p0__7_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9398,13 +9111,13 @@ define void @v_shuffle_v4p0_v4p0__7_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -9424,10 +9137,8 @@ define void @v_shuffle_v4p0_v4p0__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9440,10 +9151,8 @@ define void @v_shuffle_v4p0_v4p0__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9456,10 +9165,8 @@ define void @v_shuffle_v4p0_v4p0__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -9476,18 +9183,14 @@ define void @v_shuffle_v4p0_v4p0__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9497,18 +9200,14 @@ define void @v_shuffle_v4p0_v4p0__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9518,19 +9217,14 @@ define void @v_shuffle_v4p0_v4p0__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -9545,18 +9239,16 @@ define void @v_shuffle_v4p0_v4p0__7_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9564,18 +9256,16 @@ define void @v_shuffle_v4p0_v4p0__7_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9583,19 +9273,17 @@ define void @v_shuffle_v4p0_v4p0__7_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -9610,18 +9298,16 @@ define void @v_shuffle_v4p0_v4p0__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v8 -; GFX900-NEXT: v_mov_b32_e32 v11, v9 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9629,18 +9315,16 @@ define void @v_shuffle_v4p0_v4p0__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v8 -; GFX90A-NEXT: v_mov_b32_e32 v11, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9648,19 +9332,17 @@ define void @v_shuffle_v4p0_v4p0__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v8 -; GFX942-NEXT: v_mov_b32_e32 v11, v9 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -9675,18 +9357,16 @@ define void @v_shuffle_v4p0_v4p0__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v10 -; GFX900-NEXT: v_mov_b32_e32 v13, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9694,18 +9374,16 @@ define void @v_shuffle_v4p0_v4p0__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v10 -; GFX90A-NEXT: v_mov_b32_e32 v13, v11 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9713,20 +9391,19 @@ define void @v_shuffle_v4p0_v4p0__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v10 -; GFX942-NEXT: v_mov_b32_e32 v13, v11 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() %vec1 = call <4 x ptr> asm "; def $0", "=v"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> @@ -9741,16 +9418,13 @@ define void @v_shuffle_v4p0_v4p0__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9760,16 +9434,13 @@ define void @v_shuffle_v4p0_v4p0__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9779,16 +9450,13 @@ define void @v_shuffle_v4p0_v4p0__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -9805,14 +9473,13 @@ define void @v_shuffle_v4p0_v4p0__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9822,14 +9489,13 @@ define void @v_shuffle_v4p0_v4p0__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9839,14 +9505,13 @@ define void @v_shuffle_v4p0_v4p0__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -9863,14 +9528,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9880,14 +9544,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9897,14 +9560,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -9918,42 +9580,42 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -9968,18 +9630,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9987,18 +9650,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10006,19 +9670,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -10033,18 +9698,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10052,18 +9718,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10071,19 +9738,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -10098,18 +9766,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10117,18 +9786,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10136,19 +9806,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -10163,18 +9834,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10184,16 +9855,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10203,16 +9874,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v7 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -10226,42 +9897,42 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -10278,13 +9949,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10294,13 +9965,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10310,13 +9981,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -10578,15 +10249,15 @@ define void @v_shuffle_v4p0_v4p0__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v12 +; GFX900-NEXT: v_mov_b32_e32 v15, v13 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v12 -; GFX900-NEXT: v_mov_b32_e32 v15, v13 -; GFX900-NEXT: v_mov_b32_e32 v10, v6 -; GFX900-NEXT: v_mov_b32_e32 v11, v7 ; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10597,15 +10268,15 @@ define void @v_shuffle_v4p0_v4p0__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v12 +; GFX90A-NEXT: v_mov_b32_e32 v15, v13 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, v12 -; GFX90A-NEXT: v_mov_b32_e32 v15, v13 -; GFX90A-NEXT: v_mov_b32_e32 v10, v6 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 ; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10616,15 +10287,15 @@ define void @v_shuffle_v4p0_v4p0__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v12 +; GFX942-NEXT: v_mov_b32_e32 v15, v13 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, v12 -; GFX942-NEXT: v_mov_b32_e32 v15, v13 -; GFX942-NEXT: v_mov_b32_e32 v10, v6 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 ; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -10794,14 +10465,13 @@ define void @v_shuffle_v4p0_v4p0__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10811,14 +10481,13 @@ define void @v_shuffle_v4p0_v4p0__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10828,14 +10497,13 @@ define void @v_shuffle_v4p0_v4p0__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -10855,10 +10523,8 @@ define void @v_shuffle_v4p0_v4p0__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10871,10 +10537,8 @@ define void @v_shuffle_v4p0_v4p0__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10887,10 +10551,8 @@ define void @v_shuffle_v4p0_v4p0__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -10907,18 +10569,14 @@ define void @v_shuffle_v4p0_v4p0__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10928,18 +10586,14 @@ define void @v_shuffle_v4p0_v4p0__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10949,19 +10603,14 @@ define void @v_shuffle_v4p0_v4p0__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -10976,18 +10625,16 @@ define void @v_shuffle_v4p0_v4p0__7_1_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10995,18 +10642,16 @@ define void @v_shuffle_v4p0_v4p0__7_1_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11014,19 +10659,17 @@ define void @v_shuffle_v4p0_v4p0__7_1_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11041,18 +10684,16 @@ define void @v_shuffle_v4p0_v4p0__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11060,18 +10701,16 @@ define void @v_shuffle_v4p0_v4p0__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11079,19 +10718,17 @@ define void @v_shuffle_v4p0_v4p0__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11106,18 +10743,16 @@ define void @v_shuffle_v4p0_v4p0__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11125,18 +10760,16 @@ define void @v_shuffle_v4p0_v4p0__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11144,18 +10777,17 @@ define void @v_shuffle_v4p0_v4p0__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11172,16 +10804,13 @@ define void @v_shuffle_v4p0_v4p0__7_4_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11191,16 +10820,13 @@ define void @v_shuffle_v4p0_v4p0__7_4_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11210,16 +10836,13 @@ define void @v_shuffle_v4p0_v4p0__7_4_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11233,54 +10856,56 @@ define void @v_shuffle_v4p0_v4p0__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v4 ; GFX900-NEXT: v_mov_b32_e32 v9, v5 ; GFX900-NEXT: v_mov_b32_e32 v10, v4 ; GFX900-NEXT: v_mov_b32_e32 v11, v5 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_6_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v5 ; GFX90A-NEXT: v_mov_b32_e32 v12, v4 ; GFX90A-NEXT: v_mov_b32_e32 v13, v5 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v14, v4 +; GFX90A-NEXT: v_mov_b32_e32 v15, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_5_6_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v12, v4 ; GFX942-NEXT: v_mov_b32_e32 v13, v5 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v14, v4 +; GFX942-NEXT: v_mov_b32_e32 v15, v5 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[12:15], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11297,14 +10922,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11314,14 +10938,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11331,14 +10954,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11352,45 +10974,42 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_u_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11405,18 +11024,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11424,18 +11044,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11443,19 +11064,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11470,18 +11092,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v8 -; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11489,18 +11112,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v8 -; GFX90A-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11508,19 +11132,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v8 -; GFX942-NEXT: v_mov_b32_e32 v5, v9 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11535,18 +11160,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v10 -; GFX900-NEXT: v_mov_b32_e32 v7, v11 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11554,18 +11180,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v10 -; GFX90A-NEXT: v_mov_b32_e32 v7, v11 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11573,19 +11200,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v10 -; GFX942-NEXT: v_mov_b32_e32 v7, v11 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11600,19 +11228,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v6 -; GFX900-NEXT: v_mov_b32_e32 v11, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11622,17 +11249,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v6 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11642,18 +11268,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v6 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11670,13 +11294,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11686,13 +11310,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11702,13 +11326,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11722,45 +11346,42 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_5_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_5_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_5_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11774,42 +11395,43 @@ define void @v_shuffle_v4p0_v4p0__u_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__u_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__u_7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__u_7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11824,18 +11446,18 @@ define void @v_shuffle_v4p0_v4p0__0_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[10:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v18, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11843,18 +11465,18 @@ define void @v_shuffle_v4p0_v4p0__0_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[10:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11862,19 +11484,19 @@ define void @v_shuffle_v4p0_v4p0__0_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[10:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v6 +; GFX942-NEXT: v_mov_b32_e32 v13, v7 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11889,18 +11511,18 @@ define void @v_shuffle_v4p0_v4p0__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, v10 -; GFX900-NEXT: v_mov_b32_e32 v5, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11908,18 +11530,18 @@ define void @v_shuffle_v4p0_v4p0__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: v_mov_b32_e32 v4, v10 -; GFX90A-NEXT: v_mov_b32_e32 v5, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11927,19 +11549,19 @@ define void @v_shuffle_v4p0_v4p0__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: v_mov_b32_e32 v4, v10 -; GFX942-NEXT: v_mov_b32_e32 v5, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v6 +; GFX942-NEXT: v_mov_b32_e32 v13, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -11954,18 +11576,18 @@ define void @v_shuffle_v4p0_v4p0__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: v_mov_b32_e32 v6, v12 -; GFX900-NEXT: v_mov_b32_e32 v7, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11973,18 +11595,18 @@ define void @v_shuffle_v4p0_v4p0__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: v_mov_b32_e32 v6, v12 -; GFX90A-NEXT: v_mov_b32_e32 v7, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11992,19 +11614,19 @@ define void @v_shuffle_v4p0_v4p0__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: v_mov_b32_e32 v6, v12 -; GFX942-NEXT: v_mov_b32_e32 v7, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v6 +; GFX942-NEXT: v_mov_b32_e32 v13, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12018,20 +11640,18 @@ define void @v_shuffle_v4p0_v4p0__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v6 -; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12041,17 +11661,16 @@ define void @v_shuffle_v4p0_v4p0__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v6 -; GFX90A-NEXT: v_mov_b32_e32 v13, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12061,17 +11680,16 @@ define void @v_shuffle_v4p0_v4p0__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v12, v6 -; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12088,13 +11706,13 @@ define void @v_shuffle_v4p0_v4p0__4_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v2, v6 ; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12104,13 +11722,13 @@ define void @v_shuffle_v4p0_v4p0__4_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v2, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12120,13 +11738,13 @@ define void @v_shuffle_v4p0_v4p0__4_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v2, v6 ; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12143,11 +11761,13 @@ define void @v_shuffle_v4p0_v4p0__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12157,11 +11777,13 @@ define void @v_shuffle_v4p0_v4p0__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12171,11 +11793,13 @@ define void @v_shuffle_v4p0_v4p0__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12189,48 +11813,43 @@ define void @v_shuffle_v4p0_v4p0__6_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__6_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__6_7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__6_7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12244,48 +11863,43 @@ define void @v_shuffle_v4p0_v4p0__7_u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_u_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_u_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_u_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12299,64 +11913,61 @@ define void @v_shuffle_v4p0_v4p0__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_0_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: v_mov_b32_e32 v8, v6 ; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[10:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_0_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[10:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_0_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v18, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[10:17] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12371,18 +11982,19 @@ define void @v_shuffle_v4p0_v4p0__7_1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v10 ; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: v_mov_b32_e32 v0, v10 -; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12390,18 +12002,19 @@ define void @v_shuffle_v4p0_v4p0__7_1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v8, v10 ; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: v_mov_b32_e32 v0, v10 -; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12409,19 +12022,20 @@ define void @v_shuffle_v4p0_v4p0__7_1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v8, v10 ; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: v_mov_b32_e32 v0, v10 -; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12436,18 +12050,19 @@ define void @v_shuffle_v4p0_v4p0__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12455,18 +12070,19 @@ define void @v_shuffle_v4p0_v4p0__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: v_mov_b32_e32 v2, v12 -; GFX90A-NEXT: v_mov_b32_e32 v3, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12474,19 +12090,20 @@ define void @v_shuffle_v4p0_v4p0__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: v_mov_b32_e32 v2, v12 -; GFX942-NEXT: v_mov_b32_e32 v3, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12501,18 +12118,19 @@ define void @v_shuffle_v4p0_v4p0__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: v_mov_b32_e32 v4, v14 -; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12520,18 +12138,19 @@ define void @v_shuffle_v4p0_v4p0__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: v_mov_b32_e32 v4, v14 -; GFX90A-NEXT: v_mov_b32_e32 v5, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12539,18 +12158,20 @@ define void @v_shuffle_v4p0_v4p0__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[8:15] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: v_mov_b32_e32 v4, v14 -; GFX942-NEXT: v_mov_b32_e32 v5, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12564,54 +12185,52 @@ define void @v_shuffle_v4p0_v4p0__7_4_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_4_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_4_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_4_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12625,48 +12244,52 @@ define void @v_shuffle_v4p0_v4p0__7_5_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_5_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12680,48 +12303,52 @@ define void @v_shuffle_v4p0_v4p0__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_6_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_6_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_6_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12735,45 +12362,42 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_u_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12788,18 +12412,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v8 -; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v6, v8 -; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12807,18 +12432,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v8 -; GFX90A-NEXT: v_mov_b32_e32 v3, v9 -; GFX90A-NEXT: v_mov_b32_e32 v6, v8 -; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12826,19 +12452,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:9] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v8 -; GFX942-NEXT: v_mov_b32_e32 v3, v9 -; GFX942-NEXT: v_mov_b32_e32 v6, v8 -; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12853,18 +12480,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v10 -; GFX900-NEXT: v_mov_b32_e32 v5, v11 -; GFX900-NEXT: v_mov_b32_e32 v8, v10 -; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12872,18 +12500,19 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v10 -; GFX90A-NEXT: v_mov_b32_e32 v5, v11 -; GFX90A-NEXT: v_mov_b32_e32 v8, v10 -; GFX90A-NEXT: v_mov_b32_e32 v9, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12891,19 +12520,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[6:13] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:11] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v10 -; GFX942-NEXT: v_mov_b32_e32 v5, v11 -; GFX942-NEXT: v_mov_b32_e32 v8, v10 -; GFX942-NEXT: v_mov_b32_e32 v9, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12918,37 +12548,39 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 -; GFX900-NEXT: v_mov_b32_e32 v6, v12 -; GFX900-NEXT: v_mov_b32_e32 v7, v13 -; GFX900-NEXT: v_mov_b32_e32 v10, v12 -; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_2_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v12 -; GFX90A-NEXT: v_mov_b32_e32 v7, v13 -; GFX90A-NEXT: v_mov_b32_e32 v10, v12 -; GFX90A-NEXT: v_mov_b32_e32 v11, v13 -; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12956,19 +12588,20 @@ define void @v_shuffle_v4p0_v4p0__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:7] +; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[6:13] +; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v12 -; GFX942-NEXT: v_mov_b32_e32 v7, v13 -; GFX942-NEXT: v_mov_b32_e32 v10, v12 -; GFX942-NEXT: v_mov_b32_e32 v11, v13 -; GFX942-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v10, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -12983,19 +12616,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v6 -; GFX900-NEXT: v_mov_b32_e32 v13, v7 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v12, v14 -; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13005,17 +12637,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, v6 -; GFX90A-NEXT: v_mov_b32_e32 v13, v7 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, v14 -; GFX90A-NEXT: v_mov_b32_e32 v13, v15 -; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -13025,18 +12656,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v12, v6 -; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v12, v14 -; GFX942-NEXT: v_mov_b32_e32 v13, v15 -; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -13053,13 +12682,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v6 ; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13069,13 +12698,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -13085,13 +12714,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v6 ; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -13108,11 +12737,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13122,11 +12753,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -13136,11 +12769,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -13154,45 +12789,42 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_6_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_6_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v4p0__7_7_6_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -13328,10 +12960,9 @@ define void @s_shuffle_v4p0_v4p0__2_u_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -13474,10 +13105,9 @@ define void @s_shuffle_v4p0_v4p0__6_u_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -13750,15 +13380,14 @@ define void @s_shuffle_v4p0_v4p0__7_3_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -13895,242 +13524,435 @@ define void @s_shuffle_v4p0_v4p0__7_6_u_u() { } define void @s_shuffle_v4p0_v4p0__7_7_u_u() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_u: +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_0_u() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_0_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_u: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_u: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_0_u() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_u: +define void @s_shuffle_v4p0_v4p0__7_7_2_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_u: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_u: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s10 ; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_1_u() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_u: +define void @s_shuffle_v4p0_v4p0__7_7_3_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_u: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_u: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s10 ; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_2_u() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_u: +define void @s_shuffle_v4p0_v4p0__7_7_4_u() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_u() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_5_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_6_u() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_7_u() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_7_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_7_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_u: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_u: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_3_u() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_u: +define void @s_shuffle_v4p0_v4p0__7_7_7_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_7_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_7_2() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_7_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_7_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -14141,14 +13963,14 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_u() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_u: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -14159,3681 +13981,180 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_u: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_4_u() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_5_u() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_6_u() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_7_u() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_7_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_7_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_7_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_7_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s22 -; GFX900-NEXT: s_mov_b32 s11, s23 -; GFX900-NEXT: s_mov_b32 s12, s22 -; GFX900-NEXT: s_mov_b32 s13, s23 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s22 -; GFX90A-NEXT: s_mov_b32 s11, s23 -; GFX90A-NEXT: s_mov_b32 s12, s22 -; GFX90A-NEXT: s_mov_b32 s13, s23 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_7_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s22 -; GFX900-NEXT: s_mov_b32 s11, s23 -; GFX900-NEXT: s_mov_b32 s12, s22 -; GFX900-NEXT: s_mov_b32 s13, s23 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s22 -; GFX90A-NEXT: s_mov_b32 s11, s23 -; GFX90A-NEXT: s_mov_b32 s12, s22 -; GFX90A-NEXT: s_mov_b32 s13, s23 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_7_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_7_6() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_6: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_6: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_6: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_7_7() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__u_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__0_0_0_0() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s9 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s9 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s9 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> zeroinitializer - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__1_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__2_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__3_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__4_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__5_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__6_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_0_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_u_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_1_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_2_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_3_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_4_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_5_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_6_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_0_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_u_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_1_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_2_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s20 -; GFX900-NEXT: s_mov_b32 s13, s21 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s20 -; GFX90A-NEXT: s_mov_b32 s13, s21 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_3_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s22 -; GFX900-NEXT: s_mov_b32 s13, s23 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s22 -; GFX90A-NEXT: s_mov_b32 s13, s23 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_4_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_5_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_6_0() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__u_1_1_1() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__0_1_1_1() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__1_1_1_1() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__2_1_1_1() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__3_1_1_1() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__4_1_1_1() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__5_1_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__6_1_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_1_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_u_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_0_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_2_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_3_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_4_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_5_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_6_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_1_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_u_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_0_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_2_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_3_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v4p0__7_7_7_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_7_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_4_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v4p0__7_7_7_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_7_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_5_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1: +define void @s_shuffle_v4p0_v4p0__7_7_7_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s10, s18 ; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s10, s18 ; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 ; GFX942-NEXT: s_mov_b32 s12, s6 ; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_6_1() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v4p0__7_7_7_7() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__u_2_2_2() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2: +define void @s_shuffle_v4p0_v4p0__u_0_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 @@ -17844,41 +14165,43 @@ define void @s_shuffle_v4p0_v4p0__u_2_2_2() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__0_2_2_2() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2: +define void @s_shuffle_v4p0_v4p0__0_0_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:15] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> zeroinitializer call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__1_2_2_2() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2: +define void @s_shuffle_v4p0_v4p0__1_0_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 @@ -17888,43 +14211,43 @@ define void @s_shuffle_v4p0_v4p0__1_2_2_2() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__2_2_2_2() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2: +define void @s_shuffle_v4p0_v4p0__2_0_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__3_2_2_2() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2: +define void @s_shuffle_v4p0_v4p0__3_0_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 @@ -17934,17 +14257,17 @@ define void @s_shuffle_v4p0_v4p0__3_2_2_2() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__4_2_2_2() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2: +define void @s_shuffle_v4p0_v4p0__4_0_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_0_0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 @@ -17955,17 +14278,17 @@ define void @s_shuffle_v4p0_v4p0__4_2_2_2() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__5_2_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2: +define void @s_shuffle_v4p0_v4p0__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] @@ -17981,11 +14304,11 @@ define void @s_shuffle_v4p0_v4p0__5_2_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] @@ -18001,11 +14324,11 @@ define void @s_shuffle_v4p0_v4p0__5_2_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -18022,21 +14345,48 @@ define void @s_shuffle_v4p0_v4p0__5_2_2_2() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__6_2_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2: +define void @s_shuffle_v4p0_v4p0__6_0_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__6_0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: s_mov_b32 s10, s12 ; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: s_mov_b32 s14, s12 @@ -18046,15 +14396,17 @@ define void @s_shuffle_v4p0_v4p0__6_2_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: s_mov_b32 s10, s12 ; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: s_mov_b32 s14, s12 @@ -18064,17 +14416,17 @@ define void @s_shuffle_v4p0_v4p0__6_2_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s14, s12 @@ -18085,25 +14437,23 @@ define void @s_shuffle_v4p0_v4p0__6_2_2_2() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_2_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2: +define void @s_shuffle_v4p0_v4p0__7_u_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: s_mov_b32 s14, s12 ; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART @@ -18111,19 +14461,17 @@ define void @s_shuffle_v4p0_v4p0__7_2_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: s_mov_b32 s14, s12 ; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART @@ -18131,19 +14479,17 @@ define void @s_shuffle_v4p0_v4p0__7_2_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART @@ -18152,23 +14498,186 @@ define void @s_shuffle_v4p0_v4p0__7_2_2_2() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_1_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_0_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_2_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_0_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_3_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_3_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_u_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2: +define void @s_shuffle_v4p0_v4p0__7_4_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s14, s12 ; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART @@ -18176,17 +14685,19 @@ define void @s_shuffle_v4p0_v4p0__7_u_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s14, s12 ; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART @@ -18194,17 +14705,19 @@ define void @s_shuffle_v4p0_v4p0__7_u_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART @@ -18213,57 +14726,53 @@ define void @s_shuffle_v4p0_v4p0__7_u_2_2() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_0_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2: +define void @s_shuffle_v4p0_v4p0__7_5_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -18274,35 +14783,35 @@ define void @s_shuffle_v4p0_v4p0__7_0_2_2() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_1_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2: +define void @s_shuffle_v4p0_v4p0__7_6_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s22 ; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 ; GFX900-NEXT: s_mov_b32 s14, s12 ; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART @@ -18310,17 +14819,19 @@ define void @s_shuffle_v4p0_v4p0__7_1_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s22 ; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 ; GFX90A-NEXT: s_mov_b32 s14, s12 ; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART @@ -18328,17 +14839,19 @@ define void @s_shuffle_v4p0_v4p0__7_1_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART @@ -18347,25 +14860,48 @@ define void @s_shuffle_v4p0_v4p0__7_1_2_2() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_3_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2: +define void @s_shuffle_v4p0_v4p0__7_7_0_0() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_u_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: s_mov_b32 s14, s12 ; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART @@ -18373,19 +14909,17 @@ define void @s_shuffle_v4p0_v4p0__7_3_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: s_mov_b32 s14, s12 ; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART @@ -18393,113 +14927,165 @@ define void @s_shuffle_v4p0_v4p0__7_3_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_4_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2: +define void @s_shuffle_v4p0_v4p0__7_7_1_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_2_0() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[16:23] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s20 +; GFX9-NEXT: s_mov_b32 s13, s21 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_0() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[16:23] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s22 +; GFX9-NEXT: s_mov_b32 s13, s23 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_5_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2: +define void @s_shuffle_v4p0_v4p0__7_7_4_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART @@ -18507,19 +15093,19 @@ define void @s_shuffle_v4p0_v4p0__7_5_2_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: s_mov_b32 s14, s16 ; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART @@ -18527,137 +15113,141 @@ define void @s_shuffle_v4p0_v4p0__7_5_2_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: s_mov_b32 s12, s4 ; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_6_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2: +define void @s_shuffle_v4p0_v4p0__7_7_5_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_2_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2: +define void @s_shuffle_v4p0_v4p0__7_7_6_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -18666,124 +15256,197 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__u_1_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__0_1_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__1_1_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__2_1_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_1_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_u_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v4p0__4_1_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_0_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2: +define void @s_shuffle_v4p0_v4p0__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -18792,165 +15455,117 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_1_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v4p0__6_1_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_3_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2: +define void @s_shuffle_v4p0_v4p0__7_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_4_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2: +define void @s_shuffle_v4p0_v4p0__7_u_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -18961,16 +15576,14 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -18981,16 +15594,14 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -18999,25 +15610,23 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s18 -; GFX942-NEXT: s_mov_b32 s11, s19 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_5_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2: +define void @s_shuffle_v4p0_v4p0__7_0_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -19028,16 +15637,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -19048,16 +15657,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19066,238 +15675,131 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s18 -; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: s_mov_b32 s13, s15 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_6_2() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2: +define void @s_shuffle_v4p0_v4p0__7_2_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__u_3_3_3() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__0_3_3_3() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__1_3_3_3() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__2_3_3_3() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__3_3_3_3() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3: +define void @s_shuffle_v4p0_v4p0__7_3_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__4_3_3_3() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__5_3_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3: +define void @s_shuffle_v4p0_v4p0__7_4_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART @@ -19305,19 +15807,19 @@ define void @s_shuffle_v4p0_v4p0__5_3_3_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s14 ; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART @@ -19325,19 +15827,19 @@ define void @s_shuffle_v4p0_v4p0__5_3_3_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART @@ -19346,49 +15848,53 @@ define void @s_shuffle_v4p0_v4p0__5_3_3_3() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__6_3_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3: +define void @s_shuffle_v4p0_v4p0__7_5_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19397,37 +15903,37 @@ define void @s_shuffle_v4p0_v4p0__6_3_3_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s14, s2 +; GFX942-NEXT: s_mov_b32 s15, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_3_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3: +define void @s_shuffle_v4p0_v4p0__7_6_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART @@ -19435,19 +15941,19 @@ define void @s_shuffle_v4p0_v4p0__7_3_3_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 ; GFX90A-NEXT: s_mov_b32 s12, s14 ; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART @@ -19455,19 +15961,19 @@ define void @s_shuffle_v4p0_v4p0__7_3_3_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART @@ -19476,49 +15982,224 @@ define void @s_shuffle_v4p0_v4p0__7_3_3_3() { ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_u_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_0_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_2_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s16 +; GFX9-NEXT: s_mov_b32 s13, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s18 +; GFX9-NEXT: s_mov_b32 s13, s19 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_4_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_1() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_u_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3: +define void @s_shuffle_v4p0_v4p0__7_7_6_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19527,67 +16208,197 @@ define void @s_shuffle_v4p0_v4p0__7_u_3_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s14, s2 +; GFX942-NEXT: s_mov_b32 s15, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_0_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3: +define void @s_shuffle_v4p0_v4p0__u_2_2_2() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__0_2_2_2() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__1_2_2_2() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__2_2_2_2() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_2_2_2() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_2_2_2() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: s_mov_b32 s10, s12 ; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: s_mov_b32 s10, s12 ; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19596,88 +16407,86 @@ define void @s_shuffle_v4p0_v4p0__7_0_3_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_1_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3: +define void @s_shuffle_v4p0_v4p0__6_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_2_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3: +define void @s_shuffle_v4p0_v4p0__7_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -19690,14 +16499,14 @@ define void @s_shuffle_v4p0_v4p0__7_2_3_3() { ; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: s_mov_b32 s10, s12 ; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -19710,14 +16519,14 @@ define void @s_shuffle_v4p0_v4p0__7_2_3_3() { ; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: s_mov_b32 s10, s12 ; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19730,21 +16539,21 @@ define void @s_shuffle_v4p0_v4p0__7_2_3_3() { ; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_4_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3: +define void @s_shuffle_v4p0_v4p0__7_u_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -19755,16 +16564,14 @@ define void @s_shuffle_v4p0_v4p0__7_4_3_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -19775,16 +16582,14 @@ define void @s_shuffle_v4p0_v4p0__7_4_3_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19795,63 +16600,65 @@ define void @s_shuffle_v4p0_v4p0__7_4_3_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_5_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3: +define void @s_shuffle_v4p0_v4p0__7_0_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19862,23 +16669,25 @@ define void @s_shuffle_v4p0_v4p0__7_5_3_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_6_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3: +define void @s_shuffle_v4p0_v4p0__7_1_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -19889,16 +16698,14 @@ define void @s_shuffle_v4p0_v4p0__7_6_3_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s22 ; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -19909,16 +16716,14 @@ define void @s_shuffle_v4p0_v4p0__7_6_3_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s22 ; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19929,23 +16734,21 @@ define void @s_shuffle_v4p0_v4p0__7_6_3_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_3_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3: +define void @s_shuffle_v4p0_v4p0__7_3_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -19956,14 +16759,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -19974,14 +16779,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -19992,23 +16799,23 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_u_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3: +define void @s_shuffle_v4p0_v4p0__7_4_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -20019,12 +16826,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -20035,12 +16846,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -20051,126 +16866,63 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_0_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_1_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3: +define void @s_shuffle_v4p0_v4p0__7_5_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -20181,57 +16933,63 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_2_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3: +define void @s_shuffle_v4p0_v4p0__7_6_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -20242,21 +17000,23 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_3() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_4_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3: +define void @s_shuffle_v4p0_v4p0__7_7_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -20267,14 +17027,14 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -20285,41 +17045,39 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_5_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3: +define void @s_shuffle_v4p0_v4p0__7_7_u_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -20330,14 +17088,14 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_3() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -20348,41 +17106,172 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_6_3() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3: +define void @s_shuffle_v4p0_v4p0__7_7_0_2() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_2() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_2() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s18 +; GFX9-NEXT: s_mov_b32 s13, s19 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_4_2() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_2() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_6_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -20395,14 +17284,14 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_3() { ; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s14 ; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -20415,14 +17304,14 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_3() { ; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s14 ; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -20435,580 +17324,648 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_3() { ; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: s_mov_b32 s10, s14 ; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__u_4_4_4() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_4_4_4: +define void @s_shuffle_v4p0_v4p0__u_3_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__0_4_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v4p0__0_3_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__1_4_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v4p0__1_3_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__2_4_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4: +define void @s_shuffle_v4p0_v4p0__2_3_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_3_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_3_3_3() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__3_4_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4: +define void @s_shuffle_v4p0_v4p0__6_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__4_4_4_4() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_4_4_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__5_4_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4: +define void @s_shuffle_v4p0_v4p0__7_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__6_4_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4: +define void @s_shuffle_v4p0_v4p0__7_u_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_4_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4: +define void @s_shuffle_v4p0_v4p0__7_0_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_u_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4: +define void @s_shuffle_v4p0_v4p0__7_1_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_0_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4: +define void @s_shuffle_v4p0_v4p0__7_2_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_1_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4: +define void @s_shuffle_v4p0_v4p0__7_4_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -21019,779 +17976,738 @@ define void @s_shuffle_v4p0_v4p0__7_1_4_4() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_2_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4: +define void @s_shuffle_v4p0_v4p0__7_5_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:23] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s22 -; GFX942-NEXT: s_mov_b32 s9, s23 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s16 -; GFX942-NEXT: s_mov_b32 s13, s17 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_3_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4: +define void @s_shuffle_v4p0_v4p0__7_6_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s22 ; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s22 ; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:23] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s22 -; GFX942-NEXT: s_mov_b32 s9, s23 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s16 -; GFX942-NEXT: s_mov_b32 s13, s17 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_5_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4: +define void @s_shuffle_v4p0_v4p0__7_7_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s6 -; GFX900-NEXT: s_mov_b32 s11, s7 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s6 -; GFX90A-NEXT: s_mov_b32 s11, s7 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_6_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4: +define void @s_shuffle_v4p0_v4p0__7_7_u_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_4_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4: +define void @s_shuffle_v4p0_v4p0__7_7_0_3() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s18 +; GFX9-NEXT: s_mov_b32 s15, s19 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_3() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s18 +; GFX9-NEXT: s_mov_b32 s15, s19 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_2_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_u_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4: +define void @s_shuffle_v4p0_v4p0__7_7_4_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_0_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4: +define void @s_shuffle_v4p0_v4p0__7_7_5_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s22 -; GFX900-NEXT: s_mov_b32 s11, s23 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s22 -; GFX90A-NEXT: s_mov_b32 s11, s23 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s10 ; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_1_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4: +define void @s_shuffle_v4p0_v4p0__7_7_6_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s22 -; GFX900-NEXT: s_mov_b32 s11, s23 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s22 -; GFX90A-NEXT: s_mov_b32 s11, s23 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__u_4_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_2_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4: +define void @s_shuffle_v4p0_v4p0__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_3_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4: +define void @s_shuffle_v4p0_v4p0__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:23] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s22 -; GFX942-NEXT: s_mov_b32 s9, s23 -; GFX942-NEXT: s_mov_b32 s10, s22 -; GFX942-NEXT: s_mov_b32 s11, s23 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_5_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4: +define void @s_shuffle_v4p0_v4p0__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_6_4() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4: +define void @s_shuffle_v4p0_v4p0__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s22 -; GFX900-NEXT: s_mov_b32 s11, s23 -; GFX900-NEXT: s_mov_b32 s12, s20 -; GFX900-NEXT: s_mov_b32 s13, s21 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s22 -; GFX90A-NEXT: s_mov_b32 s11, s23 -; GFX90A-NEXT: s_mov_b32 s12, s20 -; GFX90A-NEXT: s_mov_b32 s13, s21 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -21799,533 +18715,772 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_4() { ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s0 -; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_4_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_4_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__u_5_5_5() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5: +define void @s_shuffle_v4p0_v4p0__6_4_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__0_5_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s2 -; GFX942-NEXT: s_mov_b32 s11, s3 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v4p0__7_4_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__1_5_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5: +define void @s_shuffle_v4p0_v4p0__7_u_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_0_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__2_5_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5: +define void @s_shuffle_v4p0_v4p0__7_1_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__3_5_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5: +define void @s_shuffle_v4p0_v4p0__7_2_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 -; GFX900-NEXT: s_mov_b32 s14, s10 -; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 -; GFX90A-NEXT: s_mov_b32 s14, s10 -; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s10 -; GFX942-NEXT: s_mov_b32 s13, s11 -; GFX942-NEXT: s_mov_b32 s14, s10 -; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__4_5_5_5() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5: +define void @s_shuffle_v4p0_v4p0__7_3_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__5_5_5_5() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5: +define void @s_shuffle_v4p0_v4p0__7_5_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_6_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s14, s12 +; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_4_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__6_5_5_5() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5: +define void @s_shuffle_v4p0_v4p0__7_7_u_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_5_5_5() { -; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5: +define void @s_shuffle_v4p0_v4p0__7_7_0_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[12:19] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_u_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5: +define void @s_shuffle_v4p0_v4p0__7_7_1_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s10 ; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s10 ; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_0_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5: +define void @s_shuffle_v4p0_v4p0__7_7_2_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: s_mov_b32 s12, s14 ; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s10 ; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_4() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_6_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; GFX900-NEXT: s_mov_b32 s12, s20 +; GFX900-NEXT: s_mov_b32 s13, s21 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: s_mov_b32 s12, s20 +; GFX90A-NEXT: s_mov_b32 s13, s21 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__u_5_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_1_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5: +define void @s_shuffle_v4p0_v4p0__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -22334,8 +19489,8 @@ define void @s_shuffle_v4p0_v4p0__7_1_5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART @@ -22343,7 +19498,7 @@ define void @s_shuffle_v4p0_v4p0__7_1_5_5() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -22352,8 +19507,8 @@ define void @s_shuffle_v4p0_v4p0__7_1_5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: s_mov_b32 s12, s14 ; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART @@ -22361,241 +19516,385 @@ define void @s_shuffle_v4p0_v4p0__7_1_5_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_2_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5: +define void @s_shuffle_v4p0_v4p0__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_3_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5: +define void @s_shuffle_v4p0_v4p0__2_5_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s14 +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_5_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_5_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__6_5_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_5_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_u_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_4_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5: +define void @s_shuffle_v4p0_v4p0__7_0_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_6_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5: +define void @s_shuffle_v4p0_v4p0__7_1_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART @@ -22603,16 +19902,17 @@ define void @s_shuffle_v4p0_v4p0__7_6_5_5() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s12, s14 ; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART @@ -22620,138 +19920,210 @@ define void @s_shuffle_v4p0_v4p0__7_6_5_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s4 -; GFX942-NEXT: s_mov_b32 s11, s5 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_5_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5: +define void @s_shuffle_v4p0_v4p0__7_2_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5: +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5: +; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) ret void } -define void @s_shuffle_v4p0_v4p0__7_7_u_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +define void @s_shuffle_v4p0_v4p0__7_3_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_4_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_6_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_u_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> @@ -22760,65 +20132,23 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_5() { } define void @s_shuffle_v4p0_v4p0__7_7_0_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> @@ -22831,17 +20161,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -22851,17 +20181,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -22936,15 +20266,14 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -23001,17 +20330,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[12:19] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s18 -; GFX942-NEXT: s_mov_b32 s11, s19 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -23024,118 +20353,46 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_5() { } define void @s_shuffle_v4p0_v4p0__7_7_4_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <4 x ptr> asm "; def $0", "=s"() - %vec1 = call <4 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v4p0__7_7_6_5() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s16 -; GFX900-NEXT: s_mov_b32 s13, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s16 -; GFX90A-NEXT: s_mov_b32 s13, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s4 -; GFX942-NEXT: s_mov_b32 s13, s5 -; GFX942-NEXT: s_mov_b32 s14, s2 -; GFX942-NEXT: s_mov_b32 s15, s3 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_6_5() { +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s18 +; GFX9-NEXT: s_mov_b32 s11, s19 +; GFX9-NEXT: s_mov_b32 s12, s16 +; GFX9-NEXT: s_mov_b32 s13, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> @@ -23304,17 +20561,17 @@ define void @s_shuffle_v4p0_v4p0__2_6_6_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -23324,17 +20581,17 @@ define void @s_shuffle_v4p0_v4p0__2_6_6_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -23344,17 +20601,18 @@ define void @s_shuffle_v4p0_v4p0__2_6_6_6() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -23484,14 +20742,14 @@ define void @s_shuffle_v4p0_v4p0__6_6_6_6() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s9 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -23755,17 +21013,17 @@ define void @s_shuffle_v4p0_v4p0__7_3_6_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -23775,17 +21033,17 @@ define void @s_shuffle_v4p0_v4p0__7_3_6_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -23795,17 +21053,18 @@ define void @s_shuffle_v4p0_v4p0__7_3_6_6() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s12 -; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s4 +; GFX942-NEXT: s_mov_b32 s13, s5 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -23990,19 +21249,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; GFX900-NEXT: s_mov_b32 s14, s20 +; GFX900-NEXT: s_mov_b32 s15, s21 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -24012,19 +21269,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: s_mov_b32 s14, s20 +; GFX90A-NEXT: s_mov_b32 s15, s21 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -24039,14 +21294,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_6() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s18 -; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s18 -; GFX942-NEXT: s_mov_b32 s11, s19 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s16 -; GFX942-NEXT: s_mov_b32 s15, s17 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -24272,58 +21525,22 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_6() { } define void @s_shuffle_v4p0_v4p0__7_7_4_6() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_6: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s15, s17 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_6: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s14, s16 -; GFX90A-NEXT: s_mov_b32 s15, s17 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_6: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s4 -; GFX942-NEXT: s_mov_b32 s15, s5 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_4_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s18 +; GFX9-NEXT: s_mov_b32 s9, s19 +; GFX9-NEXT: s_mov_b32 s10, s18 +; GFX9-NEXT: s_mov_b32 s11, s19 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: s_mov_b32 s15, s17 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> @@ -24400,12 +21617,12 @@ define void @s_shuffle_v4p0_v4p0__u_7_7_7() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -24489,17 +21706,17 @@ define void @s_shuffle_v4p0_v4p0__1_7_7_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s7 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -24509,17 +21726,17 @@ define void @s_shuffle_v4p0_v4p0__1_7_7_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -24529,17 +21746,18 @@ define void @s_shuffle_v4p0_v4p0__1_7_7_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -24556,17 +21774,17 @@ define void @s_shuffle_v4p0_v4p0__2_7_7_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -24576,17 +21794,17 @@ define void @s_shuffle_v4p0_v4p0__2_7_7_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -24596,17 +21814,18 @@ define void @s_shuffle_v4p0_v4p0__2_7_7_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s4 -; GFX942-NEXT: s_mov_b32 s9, s5 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -24623,17 +21842,17 @@ define void @s_shuffle_v4p0_v4p0__3_7_7_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -24643,17 +21862,17 @@ define void @s_shuffle_v4p0_v4p0__3_7_7_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -24665,15 +21884,16 @@ define void @s_shuffle_v4p0_v4p0__3_7_7_7() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -24712,14 +21932,14 @@ define void @s_shuffle_v4p0_v4p0__5_7_7_7() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -24736,14 +21956,12 @@ define void @s_shuffle_v4p0_v4p0__6_7_7_7() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 -; GFX9-NEXT: s_mov_b32 s12, s14 -; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s12, s10 +; GFX9-NEXT: s_mov_b32 s13, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -24983,17 +22201,17 @@ define void @s_shuffle_v4p0_v4p0__7_3_7_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -25003,17 +22221,17 @@ define void @s_shuffle_v4p0_v4p0__7_3_7_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -25023,17 +22241,18 @@ define void @s_shuffle_v4p0_v4p0__7_3_7_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s14 -; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -25160,12 +22379,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_7() { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ; def s[4:11] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s14 -; GFX9-NEXT: s_mov_b32 s9, s15 -; GFX9-NEXT: s_mov_b32 s10, s14 -; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -25178,65 +22397,23 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_7() { } define void @s_shuffle_v4p0_v4p0__7_7_0_7() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_0_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[12:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> @@ -25249,17 +22426,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -25269,17 +22446,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -25289,17 +22466,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_7() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 ; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -25319,14 +22496,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_7() { ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:23] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s22 -; GFX900-NEXT: s_mov_b32 s9, s23 -; GFX900-NEXT: s_mov_b32 s10, s22 -; GFX900-NEXT: s_mov_b32 s11, s23 -; GFX900-NEXT: s_mov_b32 s14, s22 -; GFX900-NEXT: s_mov_b32 s15, s23 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -25339,14 +22514,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_7() { ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:23] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s22 -; GFX90A-NEXT: s_mov_b32 s9, s23 -; GFX90A-NEXT: s_mov_b32 s10, s22 -; GFX90A-NEXT: s_mov_b32 s11, s23 -; GFX90A-NEXT: s_mov_b32 s14, s22 -; GFX90A-NEXT: s_mov_b32 s15, s23 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -25358,15 +22531,14 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_7() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -25383,17 +22555,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s18 -; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -25403,17 +22575,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s18 -; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -25425,15 +22597,15 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_7() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s14 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 -; GFX942-NEXT: s_mov_b32 s12, s6 -; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] ; GFX942-NEXT: ;;#ASMEND @@ -25446,58 +22618,22 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_7() { } define void @s_shuffle_v4p0_v4p0__7_7_4_7() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s0 -; GFX942-NEXT: s_mov_b32 s13, s1 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_4_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> @@ -25506,62 +22642,22 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_7() { } define void @s_shuffle_v4p0_v4p0__7_7_5_7() { -; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_7: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s10, s18 -; GFX900-NEXT: s_mov_b32 s11, s19 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 -; GFX900-NEXT: s_mov_b32 s14, s18 -; GFX900-NEXT: s_mov_b32 s15, s19 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:15] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_7: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s10, s18 -; GFX90A-NEXT: s_mov_b32 s11, s19 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 -; GFX90A-NEXT: s_mov_b32 s14, s18 -; GFX90A-NEXT: s_mov_b32 s15, s19 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:15] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_7: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s7 -; GFX942-NEXT: s_mov_b32 s10, s6 -; GFX942-NEXT: s_mov_b32 s11, s7 -; GFX942-NEXT: s_mov_b32 s12, s2 -; GFX942-NEXT: s_mov_b32 s13, s3 -; GFX942-NEXT: s_mov_b32 s14, s6 -; GFX942-NEXT: s_mov_b32 s15, s7 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_5_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=s"() %vec1 = call <4 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll index 90a1b99dc7c14..1cf5c6cd3f286 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v4p3_v2p3__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v4p3_v2p3__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v4p3_v2p3__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -207,15 +204,14 @@ define void @v_shuffle_v4p3_v2p3__3_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -262,10 +258,10 @@ define void @v_shuffle_v4p3_v2p3__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -306,12 +302,12 @@ define void @v_shuffle_v4p3_v2p3__3_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -349,15 +345,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -402,14 +398,14 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -457,11 +453,11 @@ define void @v_shuffle_v4p3_v2p3__3_3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -502,13 +498,13 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -548,16 +544,16 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -604,15 +600,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -660,12 +656,12 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -705,14 +701,14 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -850,14 +846,14 @@ define void @v_shuffle_v4p3_v2p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__1_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -945,16 +941,15 @@ define void @v_shuffle_v4p3_v2p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1001,15 +996,14 @@ define void @v_shuffle_v4p3_v2p3__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1056,16 +1050,15 @@ define void @v_shuffle_v4p3_v2p3__3_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1114,15 +1107,15 @@ define void @v_shuffle_v4p3_v2p3__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1169,15 +1162,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1223,16 +1216,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1278,16 +1270,16 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1333,17 +1325,16 @@ define void @v_shuffle_v4p3_v2p3__3_3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1483,14 +1474,14 @@ define void @v_shuffle_v4p3_v2p3__1_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__1_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1577,17 +1568,16 @@ define void @v_shuffle_v4p3_v2p3__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1635,16 +1625,15 @@ define void @v_shuffle_v4p3_v2p3__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_u_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1690,16 +1679,15 @@ define void @v_shuffle_v4p3_v2p3__3_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_0_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1748,13 +1736,13 @@ define void @v_shuffle_v4p3_v2p3__3_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1801,15 +1789,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1855,15 +1843,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1907,15 +1895,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1962,14 +1950,13 @@ define void @v_shuffle_v4p3_v2p3__3_3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2067,12 +2054,11 @@ define void @v_shuffle_v4p3_v2p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__1_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2120,14 +2106,14 @@ define void @v_shuffle_v4p3_v2p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2167,13 +2153,13 @@ define void @v_shuffle_v4p3_v2p3__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2214,15 +2200,15 @@ define void @v_shuffle_v4p3_v2p3__3_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2269,15 +2255,15 @@ define void @v_shuffle_v4p3_v2p3__3_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2323,14 +2309,14 @@ define void @v_shuffle_v4p3_v2p3__3_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2372,14 +2358,13 @@ define void @v_shuffle_v4p3_v2p3__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2423,15 +2408,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2481,15 +2466,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2638,16 +2623,15 @@ define void @v_shuffle_v4p3_v2p3__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2742,13 +2726,13 @@ define void @v_shuffle_v4p3_v2p3__3_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2789,15 +2773,15 @@ define void @v_shuffle_v4p3_v2p3__3_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2844,15 +2828,15 @@ define void @v_shuffle_v4p3_v2p3__3_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2898,13 +2882,14 @@ define void @v_shuffle_v4p3_v2p3__3_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2944,13 +2929,13 @@ define void @v_shuffle_v4p3_v2p3__3_3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2991,15 +2976,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3046,15 +3031,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3102,13 +3087,14 @@ define void @v_shuffle_v4p3_v2p3__3_3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll index bcb20e85b2e94..3253b4914420f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v4p3_v3p3__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -100,36 +99,33 @@ define void @v_shuffle_v4p3_v3p3__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__2_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -153,12 +149,11 @@ define void @v_shuffle_v4p3_v3p3__4_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__4_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -196,36 +191,33 @@ define void @v_shuffle_v4p3_v3p3__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -239,48 +231,45 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_0_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_0_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -294,46 +283,43 @@ define void @v_shuffle_v4p3_v3p3__5_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_1_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_1_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -347,16 +333,14 @@ define void @v_shuffle_v4p3_v3p3__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -364,15 +348,14 @@ define void @v_shuffle_v4p3_v3p3__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -380,15 +363,14 @@ define void @v_shuffle_v4p3_v3p3__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -404,37 +386,35 @@ define void @v_shuffle_v4p3_v3p3__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -448,36 +428,37 @@ define void @v_shuffle_v4p3_v3p3__5_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_4_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -491,39 +472,37 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -537,51 +516,46 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_0_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_0_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -596,15 +570,14 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -612,16 +585,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -629,17 +601,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -654,15 +624,14 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -670,15 +639,14 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -686,16 +654,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -711,40 +678,38 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -758,42 +723,40 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -807,39 +770,40 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -853,50 +817,51 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v9, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v9, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -911,15 +876,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -927,15 +892,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -943,16 +908,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -967,15 +932,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -983,16 +948,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1000,16 +965,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1025,43 +990,41 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1075,45 +1038,43 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1127,42 +1088,43 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1189,29 +1151,26 @@ define void @v_shuffle_v4p3_v3p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1273,42 +1232,39 @@ define void @v_shuffle_v4p3_v3p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__1_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__1_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1321,45 +1277,43 @@ define void @v_shuffle_v4p3_v3p3__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1385,29 +1339,26 @@ define void @v_shuffle_v4p3_v3p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1421,16 +1372,15 @@ define void @v_shuffle_v4p3_v3p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1438,15 +1388,14 @@ define void @v_shuffle_v4p3_v3p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1455,16 +1404,15 @@ define void @v_shuffle_v4p3_v3p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1480,16 +1428,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1497,17 +1444,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1515,17 +1460,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1540,49 +1483,44 @@ define void @v_shuffle_v4p3_v3p3__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1597,16 +1535,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1614,17 +1551,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1632,17 +1567,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1657,16 +1590,15 @@ define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1676,15 +1608,13 @@ define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1694,15 +1624,13 @@ define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1717,16 +1645,15 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1734,17 +1661,15 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1752,17 +1677,15 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1777,15 +1700,15 @@ define void @v_shuffle_v4p3_v3p3__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1793,16 +1716,15 @@ define void @v_shuffle_v4p3_v3p3__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1810,16 +1732,15 @@ define void @v_shuffle_v4p3_v3p3__5_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1834,16 +1755,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1851,17 +1771,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1869,17 +1787,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1893,17 +1809,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1911,16 +1825,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1928,16 +1841,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1951,17 +1863,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1969,16 +1880,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1986,17 +1896,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2010,53 +1918,51 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_2_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 ; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v9, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_2_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v8 ; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v9, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2070,53 +1976,51 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v9, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v9, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2130,17 +2034,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2148,16 +2051,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2165,17 +2067,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2282,11 +2183,11 @@ define void @v_shuffle_v4p3_v3p3__1_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2327,42 +2228,43 @@ define void @v_shuffle_v4p3_v3p3__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__2_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_1_1_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__2_1_1_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2421,16 +2323,15 @@ define void @v_shuffle_v4p3_v3p3__4_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2381,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2497,16 +2397,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2514,17 +2413,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2539,15 +2436,14 @@ define void @v_shuffle_v4p3_v3p3__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2555,16 +2451,14 @@ define void @v_shuffle_v4p3_v3p3__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2572,17 +2466,14 @@ define void @v_shuffle_v4p3_v3p3__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2597,15 +2488,14 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2613,17 +2503,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2631,17 +2519,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2656,16 +2542,15 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2675,15 +2560,13 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2693,15 +2576,13 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2718,14 +2599,13 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2733,17 +2613,15 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,17 +2629,15 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2776,15 +2652,15 @@ define void @v_shuffle_v4p3_v3p3__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2792,15 +2668,15 @@ define void @v_shuffle_v4p3_v3p3__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2808,16 +2684,15 @@ define void @v_shuffle_v4p3_v3p3__5_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2832,16 +2707,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2849,17 +2723,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2867,17 +2739,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2891,51 +2761,46 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2949,52 +2814,46 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_0_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_0_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3009,16 +2868,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3028,15 +2886,13 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3046,15 +2902,13 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3069,16 +2923,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3086,17 +2939,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3104,17 +2955,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3129,16 +2979,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3146,17 +2995,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3164,17 +3011,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3188,13 +3034,13 @@ define void @v_shuffle_v4p3_v3p3__u_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__u_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3278,14 +3124,13 @@ define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__1_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3324,42 +3169,43 @@ define void @v_shuffle_v4p3_v3p3__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__2_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_2_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__2_2_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3372,13 +3218,13 @@ define void @v_shuffle_v4p3_v3p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__3_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3418,16 +3264,15 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3474,17 +3319,16 @@ define void @v_shuffle_v4p3_v3p3__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3492,16 +3336,15 @@ define void @v_shuffle_v4p3_v3p3__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3509,16 +3352,15 @@ define void @v_shuffle_v4p3_v3p3__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3532,48 +3374,46 @@ define void @v_shuffle_v4p3_v3p3__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3588,15 +3428,14 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3606,15 +3445,14 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3624,15 +3462,14 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3646,48 +3483,46 @@ define void @v_shuffle_v4p3_v3p3__5_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_1_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_1_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3704,14 +3539,13 @@ define void @v_shuffle_v4p3_v3p3__5_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3719,16 +3553,15 @@ define void @v_shuffle_v4p3_v3p3__5_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3736,16 +3569,15 @@ define void @v_shuffle_v4p3_v3p3__5_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3760,15 +3592,15 @@ define void @v_shuffle_v4p3_v3p3__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3778,14 +3610,13 @@ define void @v_shuffle_v4p3_v3p3__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3795,14 +3626,13 @@ define void @v_shuffle_v4p3_v3p3__5_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3817,16 +3647,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3834,16 +3663,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3851,16 +3679,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3875,15 +3702,14 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3891,16 +3717,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3908,16 +3733,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 -; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3932,16 +3756,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[5:7] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3951,15 +3774,13 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3969,15 +3790,13 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3992,15 +3811,14 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4008,16 +3826,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4025,16 +3842,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4049,53 +3866,49 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 -; GFX90A-NEXT: v_mov_b32_e32 v10, v4 -; GFX90A-NEXT: v_mov_b32_e32 v11, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v8, v6 -; GFX942-NEXT: v_mov_b32_e32 v9, v6 -; GFX942-NEXT: v_mov_b32_e32 v10, v4 -; GFX942-NEXT: v_mov_b32_e32 v11, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4110,16 +3923,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4127,16 +3939,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4144,17 +3955,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4218,12 +4028,11 @@ define void @v_shuffle_v4p3_v3p3__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__1_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4260,36 +4069,33 @@ define void @v_shuffle_v4p3_v3p3__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4314,42 +4120,39 @@ define void @v_shuffle_v4p3_v3p3__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__4_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__4_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4363,45 +4166,43 @@ define void @v_shuffle_v4p3_v3p3__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4415,14 +4216,13 @@ define void @v_shuffle_v4p3_v3p3__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4464,15 +4264,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4480,17 +4280,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4498,17 +4297,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4523,15 +4321,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4539,16 +4337,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4556,17 +4353,16 @@ define void @v_shuffle_v4p3_v3p3__5_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4581,51 +4377,49 @@ define void @v_shuffle_v4p3_v3p3__5_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4641,43 +4435,41 @@ define void @v_shuffle_v4p3_v3p3__5_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4693,43 +4485,41 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4743,43 +4533,40 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4794,15 +4581,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4810,17 +4597,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4828,17 +4613,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4853,15 +4637,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4869,16 +4653,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4886,17 +4669,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4911,15 +4693,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4927,16 +4709,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4944,17 +4725,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4970,40 +4750,39 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5120,16 +4899,15 @@ define void @v_shuffle_v4p3_v3p3__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5179,16 +4957,15 @@ define void @v_shuffle_v4p3_v3p3__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5196,16 +4973,15 @@ define void @v_shuffle_v4p3_v3p3__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5213,17 +4989,15 @@ define void @v_shuffle_v4p3_v3p3__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5286,11 +5060,11 @@ define void @v_shuffle_v4p3_v3p3__4_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5332,42 +5106,43 @@ define void @v_shuffle_v4p3_v3p3__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5381,13 +5156,13 @@ define void @v_shuffle_v4p3_v3p3__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5429,15 +5204,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5445,17 +5220,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5463,17 +5237,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5488,15 +5261,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5504,15 +5277,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5520,16 +5293,16 @@ define void @v_shuffle_v4p3_v3p3__5_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5544,51 +5317,49 @@ define void @v_shuffle_v4p3_v3p3__5_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5602,44 +5373,43 @@ define void @v_shuffle_v4p3_v3p3__5_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5653,45 +5423,43 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5705,43 +5473,40 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5756,16 +5521,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5773,17 +5537,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5791,17 +5553,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5816,16 +5577,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5833,17 +5593,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5851,17 +5610,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5876,16 +5634,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5893,16 +5650,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5910,17 +5666,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5934,44 +5689,43 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5985,13 +5739,13 @@ define void @v_shuffle_v4p3_v3p3__u_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__u_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6090,16 +5844,15 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6147,16 +5900,15 @@ define void @v_shuffle_v4p3_v3p3__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6164,16 +5916,16 @@ define void @v_shuffle_v4p3_v3p3__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6181,17 +5933,16 @@ define void @v_shuffle_v4p3_v3p3__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6251,14 +6002,13 @@ define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__4_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6298,39 +6048,40 @@ define void @v_shuffle_v4p3_v3p3__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6345,16 +6096,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6362,16 +6112,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6379,16 +6129,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6403,16 +6153,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6420,16 +6169,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6437,17 +6185,16 @@ define void @v_shuffle_v4p3_v3p3__5_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6464,14 +6211,13 @@ define void @v_shuffle_v4p3_v3p3__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6481,14 +6227,14 @@ define void @v_shuffle_v4p3_v3p3__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6498,15 +6244,14 @@ define void @v_shuffle_v4p3_v3p3__5_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6520,44 +6265,43 @@ define void @v_shuffle_v4p3_v3p3__5_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6571,39 +6315,43 @@ define void @v_shuffle_v4p3_v3p3__5_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_4_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6617,42 +6365,40 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6667,16 +6413,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6684,17 +6429,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6702,17 +6445,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v4 -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_mov_b32_e32 v9, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v7, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6727,16 +6469,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6746,14 +6487,13 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6763,15 +6503,14 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6786,16 +6525,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6803,16 +6541,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6820,17 +6557,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6844,45 +6580,43 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6896,42 +6630,41 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v5, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll index 1684b94cfd452..9672a7d0c0e8e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v4p3_v4p3__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -100,36 +99,33 @@ define void @v_shuffle_v4p3_v4p3__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__2_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -142,12 +138,11 @@ define void @v_shuffle_v4p3_v4p3__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -195,12 +190,11 @@ define void @v_shuffle_v4p3_v4p3__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,36 +232,33 @@ define void @v_shuffle_v4p3_v4p3__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__6_u_u_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -281,12 +272,11 @@ define void @v_shuffle_v4p3_v4p3__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -324,16 +314,14 @@ define void @v_shuffle_v4p3_v4p3__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_0_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +366,14 @@ define void @v_shuffle_v4p3_v4p3__7_1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -431,16 +418,14 @@ define void @v_shuffle_v4p3_v4p3__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_2_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -485,16 +470,14 @@ define void @v_shuffle_v4p3_v4p3__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -508,9 +491,8 @@ define void @v_shuffle_v4p3_v4p3__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -524,9 +506,9 @@ define void @v_shuffle_v4p3_v4p3__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -540,13 +522,12 @@ define void @v_shuffle_v4p3_v4p3__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -584,12 +565,12 @@ define void @v_shuffle_v4p3_v4p3__7_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -627,13 +608,12 @@ define void @v_shuffle_v4p3_v4p3__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -671,13 +651,12 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -688,9 +667,8 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -701,9 +679,8 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -717,17 +694,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -737,14 +712,12 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -754,15 +727,12 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -777,16 +747,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -794,16 +762,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -811,17 +778,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -836,15 +801,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -852,15 +816,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -868,16 +831,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -892,16 +854,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -909,16 +869,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -926,16 +885,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -949,43 +907,39 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -999,42 +953,39 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1048,13 +999,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1094,14 +1045,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1111,11 +1061,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1125,11 +1074,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1143,18 +1091,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1162,16 +1108,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1179,17 +1124,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1204,17 +1147,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,17 +1163,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1240,18 +1179,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1266,17 +1202,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1284,16 +1218,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1301,17 +1234,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1326,16 +1258,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1343,16 +1274,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1360,17 +1290,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1384,14 +1313,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1431,46 +1360,42 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v3 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v3 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1484,15 +1409,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1532,14 +1456,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1549,11 +1473,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1563,11 +1487,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1582,13 +1506,12 @@ define void @v_shuffle_v4p3_v4p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1596,13 +1519,12 @@ define void @v_shuffle_v4p3_v4p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1610,13 +1532,12 @@ define void @v_shuffle_v4p3_v4p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1692,13 +1613,12 @@ define void @v_shuffle_v4p3_v4p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1706,13 +1626,12 @@ define void @v_shuffle_v4p3_v4p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1728,43 +1647,39 @@ define void @v_shuffle_v4p3_v4p3__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1777,15 +1692,14 @@ define void @v_shuffle_v4p3_v4p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_0_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1793,13 +1707,12 @@ define void @v_shuffle_v4p3_v4p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1807,13 +1720,12 @@ define void @v_shuffle_v4p3_v4p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1827,13 +1739,12 @@ define void @v_shuffle_v4p3_v4p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1841,13 +1752,12 @@ define void @v_shuffle_v4p3_v4p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1855,13 +1765,12 @@ define void @v_shuffle_v4p3_v4p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1877,15 +1786,13 @@ define void @v_shuffle_v4p3_v4p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1893,16 +1800,15 @@ define void @v_shuffle_v4p3_v4p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1910,17 +1816,16 @@ define void @v_shuffle_v4p3_v4p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1935,17 +1840,15 @@ define void @v_shuffle_v4p3_v4p3__6_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1953,17 +1856,15 @@ define void @v_shuffle_v4p3_v4p3__6_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1971,18 +1872,15 @@ define void @v_shuffle_v4p3_v4p3__6_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1997,17 +1895,15 @@ define void @v_shuffle_v4p3_v4p3__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2015,16 +1911,15 @@ define void @v_shuffle_v4p3_v4p3__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2032,17 +1927,16 @@ define void @v_shuffle_v4p3_v4p3__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2057,16 +1951,14 @@ define void @v_shuffle_v4p3_v4p3__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2074,16 +1966,15 @@ define void @v_shuffle_v4p3_v4p3__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2091,17 +1982,15 @@ define void @v_shuffle_v4p3_v4p3__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2116,17 +2005,15 @@ define void @v_shuffle_v4p3_v4p3__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2134,17 +2021,16 @@ define void @v_shuffle_v4p3_v4p3__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2152,18 +2038,16 @@ define void @v_shuffle_v4p3_v4p3__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2178,16 +2062,15 @@ define void @v_shuffle_v4p3_v4p3__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v7, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2195,16 +2078,15 @@ define void @v_shuffle_v4p3_v4p3__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2212,16 +2094,15 @@ define void @v_shuffle_v4p3_v4p3__7_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2236,17 +2117,15 @@ define void @v_shuffle_v4p3_v4p3__7_3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2295,17 +2174,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2313,16 +2190,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2330,17 +2206,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2355,16 +2229,15 @@ define void @v_shuffle_v4p3_v4p3__7_5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2414,17 +2287,15 @@ define void @v_shuffle_v4p3_v4p3__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2432,16 +2303,15 @@ define void @v_shuffle_v4p3_v4p3__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2449,17 +2319,15 @@ define void @v_shuffle_v4p3_v4p3__7_6_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2474,17 +2342,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2492,17 +2358,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2510,18 +2374,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2535,17 +2396,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2553,16 +2412,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2570,17 +2428,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2594,18 +2450,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2613,16 +2467,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2630,17 +2483,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2654,18 +2505,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v8 +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2673,17 +2522,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2691,17 +2539,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2715,17 +2562,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2733,16 +2579,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2750,16 +2595,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2773,18 +2617,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2794,15 +2636,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2812,15 +2653,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2834,17 +2674,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2852,16 +2691,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2869,17 +2707,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2893,17 +2729,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[6:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3045,11 +2880,11 @@ define void @v_shuffle_v4p3_v4p3__1_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3093,11 +2928,11 @@ define void @v_shuffle_v4p3_v4p3__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3107,11 +2942,11 @@ define void @v_shuffle_v4p3_v4p3__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3121,11 +2956,11 @@ define void @v_shuffle_v4p3_v4p3__2_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3138,14 +2973,14 @@ define void @v_shuffle_v4p3_v4p3__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3232,16 +3067,15 @@ define void @v_shuffle_v4p3_v4p3__5_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3291,16 +3125,15 @@ define void @v_shuffle_v4p3_v4p3__6_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3308,16 +3141,15 @@ define void @v_shuffle_v4p3_v4p3__6_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3325,17 +3157,15 @@ define void @v_shuffle_v4p3_v4p3__6_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3350,16 +3180,15 @@ define void @v_shuffle_v4p3_v4p3__7_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3409,16 +3238,14 @@ define void @v_shuffle_v4p3_v4p3__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3426,16 +3253,15 @@ define void @v_shuffle_v4p3_v4p3__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3443,17 +3269,15 @@ define void @v_shuffle_v4p3_v4p3__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3468,17 +3292,14 @@ define void @v_shuffle_v4p3_v4p3__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3486,16 +3307,15 @@ define void @v_shuffle_v4p3_v4p3__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3503,17 +3323,16 @@ define void @v_shuffle_v4p3_v4p3__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3528,17 +3347,15 @@ define void @v_shuffle_v4p3_v4p3__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3546,16 +3363,15 @@ define void @v_shuffle_v4p3_v4p3__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[6:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3563,16 +3379,15 @@ define void @v_shuffle_v4p3_v4p3__7_2_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[6:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3587,16 +3402,15 @@ define void @v_shuffle_v4p3_v4p3__7_3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3645,17 +3459,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3663,16 +3475,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3680,17 +3491,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3705,16 +3514,15 @@ define void @v_shuffle_v4p3_v4p3__7_5_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3764,17 +3572,15 @@ define void @v_shuffle_v4p3_v4p3__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3782,16 +3588,15 @@ define void @v_shuffle_v4p3_v4p3__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3799,17 +3604,15 @@ define void @v_shuffle_v4p3_v4p3__7_6_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3824,17 +3627,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3842,17 +3643,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3860,18 +3659,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3885,17 +3681,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3905,14 +3699,12 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3922,15 +3714,12 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3944,18 +3733,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3965,15 +3751,12 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3983,16 +3766,12 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4007,17 +3786,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4027,15 +3804,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4045,15 +3820,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4068,17 +3841,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v8 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4088,15 +3859,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4106,15 +3875,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4129,17 +3896,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4147,17 +3912,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v2 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4165,17 +3928,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4190,16 +3951,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4207,17 +3967,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v3 -; GFX90A-NEXT: v_mov_b32_e32 v11, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4225,17 +3983,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v10, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4250,16 +4006,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4311,10 +4066,10 @@ define void @v_shuffle_v4p3_v4p3__u_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4401,11 +4156,10 @@ define void @v_shuffle_v4p3_v4p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4447,11 +4201,11 @@ define void @v_shuffle_v4p3_v4p3__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4461,11 +4215,11 @@ define void @v_shuffle_v4p3_v4p3__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4475,11 +4229,11 @@ define void @v_shuffle_v4p3_v4p3__2_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4492,14 +4246,14 @@ define void @v_shuffle_v4p3_v4p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_2_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4541,10 +4295,10 @@ define void @v_shuffle_v4p3_v4p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4584,16 +4338,15 @@ define void @v_shuffle_v4p3_v4p3__5_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4641,16 +4394,15 @@ define void @v_shuffle_v4p3_v4p3__6_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4658,16 +4410,15 @@ define void @v_shuffle_v4p3_v4p3__6_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4675,16 +4426,15 @@ define void @v_shuffle_v4p3_v4p3__6_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4699,16 +4449,15 @@ define void @v_shuffle_v4p3_v4p3__7_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4756,19 +4505,18 @@ define void @v_shuffle_v4p3_v4p3__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_2_2: +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -4811,17 +4559,14 @@ define void @v_shuffle_v4p3_v4p3__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4868,15 +4613,14 @@ define void @v_shuffle_v4p3_v4p3__7_1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4923,16 +4667,15 @@ define void @v_shuffle_v4p3_v4p3__7_3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4981,16 +4724,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5037,15 +4779,14 @@ define void @v_shuffle_v4p3_v4p3__7_5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -5095,16 +4836,15 @@ define void @v_shuffle_v4p3_v4p3__7_6_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5151,16 +4891,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5168,16 +4907,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5185,16 +4923,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5209,16 +4946,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5226,16 +4961,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5243,16 +4977,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5267,17 +5000,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5287,15 +5018,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5305,15 +5034,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5328,17 +5055,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5346,16 +5070,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5363,16 +5086,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[6:7] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5393,11 +5115,9 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5405,16 +5125,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5422,16 +5141,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5446,54 +5164,49 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, v7 -; GFX90A-NEXT: v_mov_b32_e32 v11, v7 -; GFX90A-NEXT: v_mov_b32_e32 v12, v4 -; GFX90A-NEXT: v_mov_b32_e32 v13, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v10, v7 -; GFX942-NEXT: v_mov_b32_e32 v11, v7 -; GFX942-NEXT: v_mov_b32_e32 v12, v4 -; GFX942-NEXT: v_mov_b32_e32 v13, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5508,16 +5221,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v9, 0 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: v_mov_b32_e32 v7, v4 -; GFX900-NEXT: v_mov_b32_e32 v8, v2 -; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5525,16 +5237,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5542,17 +5253,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5567,16 +5277,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5624,39 +5333,40 @@ define void @v_shuffle_v4p3_v4p3__u_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__u_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__u_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__u_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5717,11 +5427,10 @@ define void @v_shuffle_v4p3_v4p3__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5731,11 +5440,11 @@ define void @v_shuffle_v4p3_v4p3__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5745,11 +5454,11 @@ define void @v_shuffle_v4p3_v4p3__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5762,42 +5471,40 @@ define void @v_shuffle_v4p3_v4p3__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__2_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5810,14 +5517,14 @@ define void @v_shuffle_v4p3_v4p3__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5827,11 +5534,11 @@ define void @v_shuffle_v4p3_v4p3__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5841,11 +5548,11 @@ define void @v_shuffle_v4p3_v4p3__3_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5858,39 +5565,40 @@ define void @v_shuffle_v4p3_v4p3__4_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__4_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__4_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__4_3_3_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5904,16 +5612,15 @@ define void @v_shuffle_v4p3_v4p3__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5923,14 +5630,14 @@ define void @v_shuffle_v4p3_v4p3__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5940,14 +5647,14 @@ define void @v_shuffle_v4p3_v4p3__5_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5962,16 +5669,15 @@ define void @v_shuffle_v4p3_v4p3__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5979,16 +5685,15 @@ define void @v_shuffle_v4p3_v4p3__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5996,16 +5701,15 @@ define void @v_shuffle_v4p3_v4p3__6_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6019,17 +5723,16 @@ define void @v_shuffle_v4p3_v4p3__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6039,14 +5742,14 @@ define void @v_shuffle_v4p3_v4p3__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6056,14 +5759,14 @@ define void @v_shuffle_v4p3_v4p3__7_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6077,16 +5780,15 @@ define void @v_shuffle_v4p3_v4p3__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6136,14 +5838,12 @@ define void @v_shuffle_v4p3_v4p3__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6190,15 +5890,14 @@ define void @v_shuffle_v4p3_v4p3__7_1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6244,17 +5943,15 @@ define void @v_shuffle_v4p3_v4p3__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_2_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6302,16 +5999,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6358,16 +6054,15 @@ define void @v_shuffle_v4p3_v4p3__7_5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6416,16 +6111,15 @@ define void @v_shuffle_v4p3_v4p3__7_6_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6472,16 +6166,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6489,16 +6182,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6506,16 +6198,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6530,15 +6221,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6546,15 +6236,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6562,16 +6251,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6586,17 +6274,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6606,15 +6292,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6624,15 +6308,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6653,11 +6335,9 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6667,15 +6347,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v1 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6685,15 +6364,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6708,15 +6386,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6724,15 +6401,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6740,16 +6416,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6764,16 +6439,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6781,16 +6455,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6798,17 +6471,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6823,16 +6495,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6840,16 +6511,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6857,17 +6527,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6882,16 +6551,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6989,12 +6657,11 @@ define void @v_shuffle_v4p3_v4p3__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__1_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7031,36 +6698,33 @@ define void @v_shuffle_v4p3_v4p3__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__2_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7073,12 +6737,11 @@ define void @v_shuffle_v4p3_v4p3__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7141,13 +6804,12 @@ define void @v_shuffle_v4p3_v4p3__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7155,13 +6817,12 @@ define void @v_shuffle_v4p3_v4p3__5_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7178,43 +6839,39 @@ define void @v_shuffle_v4p3_v4p3__6_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7228,15 +6885,14 @@ define void @v_shuffle_v4p3_v4p3__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7244,13 +6900,12 @@ define void @v_shuffle_v4p3_v4p3__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7258,13 +6913,12 @@ define void @v_shuffle_v4p3_v4p3__7_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7278,43 +6932,39 @@ define void @v_shuffle_v4p3_v4p3__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7329,17 +6979,15 @@ define void @v_shuffle_v4p3_v4p3__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7352,11 +7000,10 @@ define void @v_shuffle_v4p3_v4p3__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v6, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7366,15 +7013,14 @@ define void @v_shuffle_v4p3_v4p3__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v6, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7389,15 +7035,15 @@ define void @v_shuffle_v4p3_v4p3__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7407,14 +7053,13 @@ define void @v_shuffle_v4p3_v4p3__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7424,15 +7069,14 @@ define void @v_shuffle_v4p3_v4p3__7_1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7447,16 +7091,15 @@ define void @v_shuffle_v4p3_v4p3__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7464,15 +7107,14 @@ define void @v_shuffle_v4p3_v4p3__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7481,16 +7123,15 @@ define void @v_shuffle_v4p3_v4p3__7_2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[6:7] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -7506,17 +7147,15 @@ define void @v_shuffle_v4p3_v4p3__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7526,15 +7165,13 @@ define void @v_shuffle_v4p3_v4p3__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7544,15 +7181,13 @@ define void @v_shuffle_v4p3_v4p3__7_3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7566,46 +7201,42 @@ define void @v_shuffle_v4p3_v4p3__7_5_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 ; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7619,14 +7250,14 @@ define void @v_shuffle_v4p3_v4p3__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7634,13 +7265,12 @@ define void @v_shuffle_v4p3_v4p3__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7648,13 +7278,12 @@ define void @v_shuffle_v4p3_v4p3__7_6_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7668,46 +7297,42 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7721,14 +7346,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7769,17 +7393,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7787,17 +7409,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v2 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7805,17 +7425,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v2 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7830,17 +7448,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7848,16 +7464,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7865,17 +7480,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7890,15 +7503,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7906,16 +7519,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7923,17 +7535,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7948,17 +7559,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7966,16 +7575,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7983,17 +7591,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8007,14 +7614,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8054,15 +7661,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v0 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8157,12 +7763,12 @@ define void @v_shuffle_v4p3_v4p3__0_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8216,10 +7822,9 @@ define void @v_shuffle_v4p3_v4p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8275,10 +7880,9 @@ define void @v_shuffle_v4p3_v4p3__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8286,16 +7890,15 @@ define void @v_shuffle_v4p3_v4p3__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8303,16 +7906,15 @@ define void @v_shuffle_v4p3_v4p3__2_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8330,13 +7932,12 @@ define void @v_shuffle_v4p3_v4p3__3_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8433,11 +8034,11 @@ define void @v_shuffle_v4p3_v4p3__5_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8482,11 +8083,11 @@ define void @v_shuffle_v4p3_v4p3__6_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8496,11 +8097,11 @@ define void @v_shuffle_v4p3_v4p3__6_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8510,11 +8111,11 @@ define void @v_shuffle_v4p3_v4p3__6_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8528,14 +8129,14 @@ define void @v_shuffle_v4p3_v4p3__7_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8577,14 +8178,13 @@ define void @v_shuffle_v4p3_v4p3__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8592,13 +8192,12 @@ define void @v_shuffle_v4p3_v4p3__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8606,13 +8205,12 @@ define void @v_shuffle_v4p3_v4p3__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8627,17 +8225,15 @@ define void @v_shuffle_v4p3_v4p3__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8685,15 +8281,15 @@ define void @v_shuffle_v4p3_v4p3__7_1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8741,17 +8337,15 @@ define void @v_shuffle_v4p3_v4p3__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8759,15 +8353,14 @@ define void @v_shuffle_v4p3_v4p3__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[6:7] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8776,16 +8369,15 @@ define void @v_shuffle_v4p3_v4p3__7_2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[6:7] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -8801,17 +8393,15 @@ define void @v_shuffle_v4p3_v4p3__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8821,15 +8411,13 @@ define void @v_shuffle_v4p3_v4p3__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8839,15 +8427,13 @@ define void @v_shuffle_v4p3_v4p3__7_3_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8861,15 +8447,14 @@ define void @v_shuffle_v4p3_v4p3__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8877,13 +8462,12 @@ define void @v_shuffle_v4p3_v4p3__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8891,13 +8475,12 @@ define void @v_shuffle_v4p3_v4p3__7_4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8926,13 +8509,12 @@ define void @v_shuffle_v4p3_v4p3__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8940,13 +8522,12 @@ define void @v_shuffle_v4p3_v4p3__7_6_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8960,45 +8541,42 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9012,13 +8590,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9059,16 +8637,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9076,17 +8653,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9094,17 +8669,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9119,16 +8692,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9136,17 +8708,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v1 -; GFX90A-NEXT: v_mov_b32_e32 v11, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9154,17 +8725,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v1 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9179,16 +8749,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9196,16 +8765,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9213,17 +8781,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9238,17 +8805,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9256,17 +8821,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9274,17 +8838,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v4, v5 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9298,45 +8861,42 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9350,46 +8910,42 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v2 -; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v2 -; GFX90A-NEXT: v_mov_b32_e32 v9, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v2 -; GFX942-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9406,10 +8962,10 @@ define void @v_shuffle_v4p3_v4p3__u_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9514,10 +9070,9 @@ define void @v_shuffle_v4p3_v4p3__1_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9571,10 +9126,9 @@ define void @v_shuffle_v4p3_v4p3__2_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9582,16 +9136,16 @@ define void @v_shuffle_v4p3_v4p3__2_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v6 -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9599,16 +9153,16 @@ define void @v_shuffle_v4p3_v4p3__2_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v6 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v7, v6 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9626,13 +9180,12 @@ define void @v_shuffle_v4p3_v4p3__3_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: v_mov_b32_e32 v7, v6 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9728,11 +9281,10 @@ define void @v_shuffle_v4p3_v4p3__5_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9775,11 +9327,11 @@ define void @v_shuffle_v4p3_v4p3__6_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9789,11 +9341,11 @@ define void @v_shuffle_v4p3_v4p3__6_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9803,11 +9355,11 @@ define void @v_shuffle_v4p3_v4p3__6_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -9821,14 +9373,14 @@ define void @v_shuffle_v4p3_v4p3__7_6_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9868,13 +9420,13 @@ define void @v_shuffle_v4p3_v4p3__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9915,16 +9467,15 @@ define void @v_shuffle_v4p3_v4p3__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9972,16 +9523,15 @@ define void @v_shuffle_v4p3_v4p3__7_1_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10031,15 +9581,14 @@ define void @v_shuffle_v4p3_v4p3__7_2_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 ; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10088,16 +9637,15 @@ define void @v_shuffle_v4p3_v4p3__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v6 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10105,16 +9653,16 @@ define void @v_shuffle_v4p3_v4p3__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10122,16 +9670,16 @@ define void @v_shuffle_v4p3_v4p3__7_3_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v6 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10192,13 +9740,14 @@ define void @v_shuffle_v4p3_v4p3__7_5_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10238,14 +9787,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10287,14 +9836,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10337,17 +9885,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, v4 -; GFX900-NEXT: v_mov_b32_e32 v6, v4 -; GFX900-NEXT: v_mov_b32_e32 v7, v0 -; GFX900-NEXT: v_mov_b32_e32 v8, v3 -; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10355,17 +9901,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, v5 -; GFX90A-NEXT: v_mov_b32_e32 v9, v5 -; GFX90A-NEXT: v_mov_b32_e32 v10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v11, v4 -; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10373,17 +9918,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, v0 -; GFX942-NEXT: v_mov_b32_e32 v8, v5 -; GFX942-NEXT: v_mov_b32_e32 v9, v5 -; GFX942-NEXT: v_mov_b32_e32 v11, v4 -; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v7 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10398,17 +9943,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v5 -; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v1 -; GFX900-NEXT: v_mov_b32_e32 v9, v4 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10416,16 +9959,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10433,17 +9975,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10458,16 +9998,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10517,17 +10056,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10535,16 +10072,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10552,17 +10088,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_6(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10576,46 +10111,42 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10629,15 +10160,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: v_mov_b32_e32 v7, v2 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10677,39 +10207,40 @@ define void @v_shuffle_v4p3_v4p3__u_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__u_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__u_7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__u_7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10789,10 +10320,10 @@ define void @v_shuffle_v4p3_v4p3__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10800,16 +10331,16 @@ define void @v_shuffle_v4p3_v4p3__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10817,17 +10348,16 @@ define void @v_shuffle_v4p3_v4p3__1_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10848,10 +10378,10 @@ define void @v_shuffle_v4p3_v4p3__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10859,16 +10389,16 @@ define void @v_shuffle_v4p3_v4p3__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10876,16 +10406,16 @@ define void @v_shuffle_v4p3_v4p3__2_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: v_mov_b32_e32 v4, v7 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10903,13 +10433,13 @@ define void @v_shuffle_v4p3_v4p3__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10917,16 +10447,16 @@ define void @v_shuffle_v4p3_v4p3__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10934,16 +10464,17 @@ define void @v_shuffle_v4p3_v4p3__3_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11006,11 +10537,10 @@ define void @v_shuffle_v4p3_v4p3__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11020,11 +10550,11 @@ define void @v_shuffle_v4p3_v4p3__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11034,11 +10564,11 @@ define void @v_shuffle_v4p3_v4p3__5_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11052,42 +10582,40 @@ define void @v_shuffle_v4p3_v4p3__6_7_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__6_7_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_7_7_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__6_7_7_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11101,13 +10629,13 @@ define void @v_shuffle_v4p3_v4p3__7_u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11148,16 +10676,15 @@ define void @v_shuffle_v4p3_v4p3__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11205,16 +10732,15 @@ define void @v_shuffle_v4p3_v4p3__7_1_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11264,15 +10790,14 @@ define void @v_shuffle_v4p3_v4p3__7_2_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 ; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -11321,16 +10846,15 @@ define void @v_shuffle_v4p3_v4p3__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11338,16 +10862,16 @@ define void @v_shuffle_v4p3_v4p3__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 ; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11355,16 +10879,16 @@ define void @v_shuffle_v4p3_v4p3__7_3_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v7 ; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v6, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11378,15 +10902,14 @@ define void @v_shuffle_v4p3_v4p3__7_4_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11426,13 +10949,14 @@ define void @v_shuffle_v4p3_v4p3__7_5_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11472,14 +10996,14 @@ define void @v_shuffle_v4p3_v4p3__7_6_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_7_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11519,13 +11043,13 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11536,9 +11060,9 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11549,9 +11073,9 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11566,16 +11090,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[5:8] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v9, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11583,16 +11106,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11600,17 +11122,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11625,16 +11145,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11642,16 +11161,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11659,17 +11178,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:5] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11684,16 +11202,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v6 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11701,16 +11218,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11718,17 +11234,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11743,16 +11258,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v7 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11760,16 +11274,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v7 -; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11777,16 +11291,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_7(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, v7 -; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11800,46 +11314,42 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v0 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11853,46 +11363,42 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v6, v1 -; GFX900-NEXT: v_mov_b32_e32 v7, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, v3 -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v8, v1 -; GFX90A-NEXT: v_mov_b32_e32 v9, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v6, v3 -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-NEXT: v_mov_b32_e32 v9, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11906,13 +11412,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-legalizer-divergence.ll b/llvm/test/CodeGen/AMDGPU/vector-legalizer-divergence.ll index bb0b661e800c3..8c634934947a4 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-legalizer-divergence.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-legalizer-divergence.ll @@ -10,18 +10,17 @@ define amdgpu_kernel void @spam(ptr addrspace(1) noalias %arg) { ; CHECK-LABEL: spam: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CHECK-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_mov_b32 s3, 0xf000 ; CHECK-NEXT: s_mov_b32 s2, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; CHECK-NEXT: v_mov_b32_e32 v0, v5 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x7ff80000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:16 +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v[0:1], s[0:3], 0 addr64 offset:16 ; CHECK-NEXT: s_waitcnt expcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, v5 -; CHECK-NEXT: v_mov_b32_e32 v3, v5 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:48 +; CHECK-NEXT: v_mov_b32_e32 v3, v1 +; CHECK-NEXT: v_mov_b32_e32 v4, v1 +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v[0:1], s[0:3], 0 addr64 offset:48 ; CHECK-NEXT: s_endpgm %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = zext i32 %tmp to i64 diff --git a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll index 1a8f198ecf70a..69f6c38d55a2d 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll @@ -54,27 +54,24 @@ define amdgpu_kernel void @widen_vselect_and_mask_v4i64(<4 x i64> %arg) #0 { ; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[4:5], 0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], 16 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v1 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[2:3] +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_ne_u64_e64 s[0:1], s[0:1], 0 -; GCN-NEXT: v_mov_b32_e32 v5, v4 -; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] +; GCN-NEXT: v_mov_b32_e32 v6, v5 +; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[5:6] ; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_mov_b32_e32 v6, v0 -; GCN-NEXT: v_mov_b32_e32 v7, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GCN-NEXT: buffer_store_dwordx4 v[1:4], off, s[8:11], 0 ; GCN-NEXT: s_mov_b32 s6, s10 ; GCN-NEXT: s_mov_b32 s7, s11 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm bb: %tmp = extractelement <4 x i64> %arg, i64 0 From 75ac548ded6ecafccf74116b7389790ac8ef855e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 4 Nov 2025 15:39:15 -0800 Subject: [PATCH 2/3] Rename test --- .../{limit-coalesce.mir => no-limit-coalesce.mir} | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) rename llvm/test/CodeGen/AMDGPU/{limit-coalesce.mir => no-limit-coalesce.mir} (86%) diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/no-limit-coalesce.mir similarity index 86% rename from llvm/test/CodeGen/AMDGPU/limit-coalesce.mir rename to llvm/test/CodeGen/AMDGPU/no-limit-coalesce.mir index a245c475638f2..934a536edb726 100644 --- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/llvm/test/CodeGen/AMDGPU/no-limit-coalesce.mir @@ -1,25 +1,22 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 # RUN: llc -mtriple=amdgcn -run-pass register-coalescer -o - %s | FileCheck %s -# Check that coalescer does not create wider register tuple than in -# source. -# No more registers shall be defined +# Check that coalescer may create wider register tuple than in source. --- -name: limit_coalesce +name: no_limit_coalesce tracksRegLiveness: true body: | bb.0: liveins: $sgpr16, $sgpr17 - ; CHECK-LABEL: name: limit_coalesce + ; CHECK-LABEL: name: no_limit_coalesce ; CHECK: liveins: $sgpr16, $sgpr17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:sgpr_64 = COPY $sgpr17 ; CHECK-NEXT: [[COPY:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr16 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: INLINEASM &"; def $0", 0 /* attdialect */, 2818058 /* regdef:VReg_64 */, def %4 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_128 = COPY %4.sub1 - ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: INLINEASM &"; def $0", 0 /* attdialect */, 2818058 /* regdef:VReg_64 */, def undef %5.sub0_sub1 + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], %5.sub1_sub2_sub3_sub4, [[COPY]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:sgpr_32 = COPY killed $sgpr17 %1:sgpr_32 = COPY killed $sgpr16 @@ -52,7 +49,6 @@ body: | ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF1]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_128 = COPY undef [[COPY]].sub2 ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr %2:vreg_64 = IMPLICIT_DEF undef %3.sub0:vreg_64 = COPY $sgpr0 From ddce26ba219ad38569b8ae3f6b8fbf1365f4ecc3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 20 Nov 2025 21:04:24 -0500 Subject: [PATCH 3/3] regression --- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 41 +- .../test/CodeGen/AMDGPU/lds-misaligned-bug.ll | 94 ++-- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 408 +++++++++--------- 3 files changed, 284 insertions(+), 259 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 447da3d26f793..ce541dd2954f4 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -979,7 +979,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: s_mov_b64 s[8:9], 0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_branch .LBB5_3 ; GCN-NEXT: .LBB5_1: ; %Flow @@ -1002,36 +1002,45 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: ; %bb.4: ; %bb2 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_mov_b32_e32 v5, v4 +; GCN-NEXT: v_mov_b32_e32 v4, v3 +; GCN-NEXT: v_mov_b32_e32 v3, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB5_2 ; GCN-NEXT: ; %bb.5: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_mov_b32_e32 v5, v4 +; GCN-NEXT: v_mov_b32_e32 v4, v3 +; GCN-NEXT: v_mov_b32_e32 v3, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v3 -; GCN-NEXT: v_mov_b32_e32 v3, v0 +; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[12:13], s[6:7] ; GCN-NEXT: s_cbranch_execz .LBB5_1 ; GCN-NEXT: ; %bb.6: ; %bb8 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: v_mov_b32_e32 v5, v3 +; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_branch .LBB5_1 ; GCN-NEXT: .LBB5_7: ; %bb12 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index fa0568d307907..3d79bdc25336d 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -462,12 +462,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v4, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[1:3], v[4:5] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 -; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[4:5], v[0:2] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 +; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v3: @@ -475,12 +476,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v4, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[1:3], v[4:5] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[4:5], v[0:2] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v3: @@ -490,13 +492,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v4, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b96 v[1:3], v[4:5] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 -; ALIGNED-GFX11-NEXT: flat_store_b96 v[4:5], v[0:2] +; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; ALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v3: @@ -506,13 +508,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v4, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b96 v[1:3], v[4:5] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 -; UNALIGNED-GFX11-NEXT: flat_store_b96 v[4:5], v[0:2] +; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; UNALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -604,33 +606,36 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; SPLIT: ; %bb.0: ; %bb ; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_lshl_add_u32 v4, v0, 2, s0 -; SPLIT-NEXT: ds_read_b96 v[1:3], v4 +; SPLIT-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; SPLIT-NEXT: ds_read_b96 v[0:2], v5 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v0, v3 -; SPLIT-NEXT: ds_write_b96 v4, v[0:2] +; SPLIT-NEXT: v_mov_b32_e32 v3, v0 +; SPLIT-NEXT: v_mov_b32_e32 v4, v1 +; SPLIT-NEXT: ds_write_b96 v5, v[2:4] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_local_aligned_v3: ; ALIGNED-GFX10: ; %bb.0: ; %bb ; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0 -; ALIGNED-GFX10-NEXT: ds_read_b96 v[1:3], v4 +; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; ALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 -; ALIGNED-GFX10-NEXT: ds_write_b96 v4, v[0:2] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 +; ALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_local_aligned_v3: ; UNALIGNED-GFX10: ; %bb.0: ; %bb ; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0 -; UNALIGNED-GFX10-NEXT: ds_read_b96 v[1:3], v4 +; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; UNALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 -; UNALIGNED-GFX10-NEXT: ds_write_b96 v4, v[0:2] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 +; UNALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_local_aligned_v3: @@ -639,11 +644,11 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0 -; ALIGNED-GFX11-NEXT: ds_load_b96 v[1:3], v4 +; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; ALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 -; ALIGNED-GFX11-NEXT: ds_store_b96 v4, v[0:2] +; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; ALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_local_aligned_v3: @@ -652,11 +657,11 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0 -; UNALIGNED-GFX11-NEXT: ds_load_b96 v[1:3], v4 +; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 +; UNALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 -; UNALIGNED-GFX11-NEXT: ds_store_b96 v4, v[0:2] +; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; UNALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -856,12 +861,11 @@ define amdgpu_kernel void @test_local_v4_aligned8(ptr addrspace(3) %arg) { ; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) ; SPLIT-NEXT: v_lshl_add_u32 v6, v0, 2, s0 -; SPLIT-NEXT: ds_read2_b64 v[0:3], v6 offset1:1 +; SPLIT-NEXT: ds_read2_b64 v[1:4], v6 offset1:1 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v4, v1 -; SPLIT-NEXT: v_mov_b32_e32 v5, v0 -; SPLIT-NEXT: v_mov_b32_e32 v1, v3 -; SPLIT-NEXT: ds_write2_b64 v6, v[1:2], v[4:5] offset1:1 +; SPLIT-NEXT: v_mov_b32_e32 v0, v2 +; SPLIT-NEXT: v_mov_b32_e32 v5, v3 +; SPLIT-NEXT: ds_write2_b64 v6, v[4:5], v[0:1] offset1:1 ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_local_v4_aligned8: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 9fdc72f054f90..a7111659f898c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -6473,85 +6473,84 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v11, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10009 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x1000d -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x10007 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x10003 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10001 -; GFX8-NEXT: s_and_b32 s8, s2, 1 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x10002 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10004 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x10006 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x1000c +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10009 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x1000d +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x10007 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10003 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x10001 +; GFX8-NEXT: s_and_b32 s9, s2, 1 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10002 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x10004 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10006 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x1000c ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x1000a -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_bfe_u32 v4, v2, 11, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 15, v2 +; GFX8-NEXT: v_bfe_u32 v12, v2, 5, 1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 14, 1 +; GFX8-NEXT: v_bfe_u32 v0, v2, 8, 1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 +; GFX8-NEXT: v_mov_b32_e32 v11, s3 +; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 -; GFX8-NEXT: v_bfe_u32 v2, v4, 11, 1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v11, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 15, v4 -; GFX8-NEXT: v_bfe_u32 v14, v4, 5, 1 -; GFX8-NEXT: v_bfe_u32 v8, v4, 14, 1 -; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 1 +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GFX8-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v0, s11 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, v12 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v2, v14 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s9 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v16i1_to_v16i64: @@ -6640,57 +6639,58 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] +; GFX12-NEXT: global_load_u16 v0, v3, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v4, 0xffff, v0 -; GFX12-NEXT: v_mov_b32_e32 v11, v1 +; GFX12-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_and_b32 v6, 0xffff, v0 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000a ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3 -; GFX12-NEXT: v_bfe_u32 v2, v4, 11, 1 -; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v2, s3 +; GFX12-NEXT: v_bfe_u32 v4, v6, 11, 1 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009 +; GFX12-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-NEXT: v_bfe_u32 v0, v6, 8, 1 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c -; GFX12-NEXT: v_mov_b32_e32 v5, v1 -; GFX12-NEXT: v_bfe_u32 v6, v4, 5, 1 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:80 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d +; GFX12-NEXT: v_lshrrev_b32_e32 v8, 15, v6 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v4, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10007 +; GFX12-NEXT: global_store_b128 v3, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10006 -; GFX12-NEXT: v_mov_b32_e32 v9, v1 -; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10002 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 +; GFX12-NEXT: v_bfe_u32 v0, v6, 5, 1 +; GFX12-NEXT: v_bfe_u32 v6, v6, 14, 1 +; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:96 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_mov_b32_e32 v4, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10004 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10002 +; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:48 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 -; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004 -; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009 -; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10001 -; GFX12-NEXT: v_lshrrev_b32_e32 v10, 15, v4 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: v_mov_b32_e32 v4, v0 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003 +; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:32 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: v_mov_b32_e32 v2, v6 -; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10003 +; GFX12-NEXT: v_mov_b32_e32 v4, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001 ; GFX12-NEXT: s_and_b32 s2, s2, 1 -; GFX12-NEXT: v_bfe_u32 v8, v4, 14, 1 -; GFX12-NEXT: v_bfe_u32 v4, v4, 8, 1 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:16 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: v_mov_b32_e32 v6, s3 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_mov_b32_e32 v2, s5 -; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: v_mov_b32_e32 v4, s3 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v3, v[6:9], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: constant_zextload_v16i1_to_v16i64: @@ -9663,169 +9663,181 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10014 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s4 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10015 ; GFX1250-NEXT: s_lshr_b32 s4, s3, 31 -; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1 +; GFX1250-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, v3 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001e -; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10004 -; GFX1250-NEXT: s_and_b32 s7, s2, 1 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416 +; GFX1250-NEXT: s_bfe_u32 s6, s3, 0x10002 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:416 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1001d ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001c -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:496 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:496 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1001b ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001a -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:480 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:480 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10019 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10018 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:464 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:464 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10017 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10016 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:448 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10013 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10012 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:432 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10011 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10010 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:400 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000f ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000e -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:384 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000d ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000c -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:368 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:368 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000b ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000a -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:352 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:352 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10009 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10008 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:336 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:336 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10007 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10006 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:320 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:320 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10005 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10004 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:304 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:304 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10003 -; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10002 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:288 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 +; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10003 +; GFX1250-NEXT: s_mov_b32 s4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s3, 0x10001 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:288 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_lshr_b32 s4, s2, 31 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001e -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:272 +; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v4, s5 +; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:272 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001d -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001c -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: s_lshr_b32 s3, s2, 31 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001e +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1001d +; GFX1250-NEXT: global_store_b128 v3, v[0:3], s[0:1] offset:256 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001b -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001a -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001c +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:240 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10019 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10018 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1001b +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001a +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:224 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10017 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10016 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10019 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10018 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:208 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10014 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10015 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10017 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10016 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:192 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10013 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10012 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 +; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10014 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10015 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:176 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10011 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10010 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 +; GFX1250-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v4, s4 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10013 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10012 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:160 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000f -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000e -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10011 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10010 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:144 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000d -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000c -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000f +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000e +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:128 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000b -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000a -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000d +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000c +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:112 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10009 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10008 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000b +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000a +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:96 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10007 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10006 -; GFX1250-NEXT: v_mov_b32_e32 v7, v1 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10009 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10008 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:80 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_mov_b32 s4, s3 -; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10001 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10007 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10006 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:64 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 ; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10005 -; GFX1250-NEXT: v_mov_b32_e32 v6, s5 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10004 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:48 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 ; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003 -; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10001 -; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x10002 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10002 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:32 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3 -; GFX1250-NEXT: s_and_b64 s[2:3], s[4:5], 1 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10001 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_and_b32 s2, s2, 1 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:16 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v2, s6 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:256 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX1250-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v4, s3 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] ; GFX1250-NEXT: s_endpgm %load = load <64 x i1>, ptr addrspace(4) %in %ext = zext <64 x i1> %load to <64 x i64>